new iOS/web extractors, image proxying done by views.py

- ythdd_globals.py - added helper function to get user-configured header
- ythdd.py - now checks for config.toml in work directory
- requirements.txt - add brotli, so that requests can decompress
innertube request
This commit is contained in:
2024-12-26 20:15:45 +01:00
parent 1e4b05c33b
commit 0099736a74
7 changed files with 251 additions and 45 deletions

View File

@@ -1,15 +1,19 @@
[general]
db_file_path = "/path/to/ythdd_db.sqlite" # Preferably stored on an SSD.
video_storage_directory_path = "/path/to/videos/" # Path to video vault.
is_proxied = false
is_proxied = false # Set to true if running behind reverse proxy.
public_facing_url = "http://localhost:5000/" # Used for URL rewriting. Note the trailing slash /.
[api]
api_key = "" # Leave the API key empty for public access to the non-sensitive backend.
api_key_admin = "CHANGEME" # Empty *admin* API key will autogenerate a random one every launch.
[extractor]
user-agent = "" # leave empty for default
cookies_path = "" # leave empty for none
user-agent = "" # Leave empty for default (Firefox ESR).
cookies_path = "" # Leave empty for none.
[proxy]
user-agent = "" # Leave empty for default (Firefox ESR).
[admin]
# List of users with admin privileges.

View File

@@ -12,3 +12,4 @@ toml>=0.10.2
Flask-APScheduler>=1.13.1
requests>=2.32.3
yt_dlp
brotli>=1.1.0

View File

@@ -1,8 +1,9 @@
#!/usr/bin/python3
from flask import render_template
from flask import render_template, Response
from flask_sqlalchemy import SQLAlchemy
from markupsafe import escape
import requests, json
import ythdd_globals
def homepage():
return "homepage"
@@ -12,3 +13,31 @@ def home():
def index():
return "index"
def thumbnailProxy(received_request):
    """Stream a video thumbnail from i.ytimg.com back to the client.

    received_request is the part of the URL path after "/vi/". Its first
    path segment must be exactly 11 characters long (the length this
    codebase expects of a video id); anything else is answered with a
    JSON 400 response without contacting the upstream host.

    Returns a flask Response that relays the upstream body, content type
    and status code.
    """
    # apparently, this can be set to
    # https://img.youtube.com/ as well
    prefix = "https://i.ytimg.com/"

    # find() returns -1 when there is no "/" at all, so this single test
    # covers both "no slash" and "first slash in the wrong place".
    if received_request.find("/") != 11:
        return Response(json.dumps({
            'status': 400,
            'error_msg': 'invalid request. pretend this is a thumbnail :D'
        }), mimetype='application/json', status=400)

    thumbnail = requests.get(
        prefix + "vi/" + received_request,
        headers=ythdd_globals.getHeaders(caller='proxy'),
        stream=True,
        # Without a timeout a stalled upstream would block this worker forever.
        timeout=10,
    )
    # Let urllib3 undo any transfer compression while the body is streamed.
    thumbnail.raw.decode_content = True

    return Response(
        thumbnail.raw,
        # Do not crash with KeyError if the upstream omits Content-Type.
        mimetype=thumbnail.headers.get('content-type', 'application/octet-stream'),
        status=thumbnail.status_code,
    )
def ggphtProxy(received_request):
    """Stream an image from yt3.ggpht.com back to the client.

    received_request is the part of the URL path after "/ggpht/" and is
    appended to the upstream prefix unchanged.

    Returns a flask Response that relays the upstream body, content type
    and status code.
    """
    prefix = "https://yt3.ggpht.com/"

    ggpht = requests.get(
        prefix + received_request,
        headers=ythdd_globals.getHeaders(caller='proxy'),
        stream=True,
        # Without a timeout a stalled upstream would block this worker forever.
        timeout=10,
    )
    # Let urllib3 undo any transfer compression while the body is streamed.
    ggpht.raw.decode_content = True

    return Response(
        ggpht.raw,
        # Do not crash with KeyError if the upstream omits Content-Type.
        mimetype=ggpht.headers.get('content-type', 'application/octet-stream'),
        status=ggpht.status_code,
    )

View File

@@ -6,15 +6,18 @@ from argparse import ArgumentParser
from ythdd_globals import colors
import requests, json, toml, time
import views, downloader, ythdd_api, ythdd_globals, ythdd_db
import os
from flask_apscheduler import APScheduler
app = Flask(__name__)
app_host = "None"
app_port = "None"
def setup():
# sanity check: make sure config is set
# required to make `flask --app ythdd run --debug` work
global config
global config, app_host, app_port
try:
if not config['general']:
ythdd_globals.setConfig(ythdd_globals.configfile)
@@ -31,6 +34,25 @@ def setup():
ythdd_globals.isProxied = config['general']['is_proxied']
ythdd_globals.outsideApiHits = 0
are_we_sure_of_host_and_port = True
if app_host == "None":
app_host = "127.0.0.1"
are_we_sure_of_host_and_port = False
if app_port == "None":
app_port = "5000"
are_we_sure_of_host_and_port = False
public_facing_url = config['general']['public_facing_url']
rewrite_sanity_check = public_facing_url.replace(f"{app_host}:{app_port}", "")
if not config['general']['is_proxied'] and public_facing_url == rewrite_sanity_check:
sanity_string = f"{colors.WARNING}Heads up!{colors.ENDC} Public facing URL does not match the IP and port the server is running on.\n"
sanity_string += f" Expected: {colors.OKCYAN}{config['general']['public_facing_url']}{colors.ENDC}, but"
if not are_we_sure_of_host_and_port: sanity_string += " (assuming it's)"
sanity_string += f" running on: {colors.OKCYAN}{app_host}:{app_port}{colors.ENDC}.\n"
sanity_string += f" This is just a sanity check and may not neccessarily mean bad configuration.\n"
sanity_string += f" If you're running a reverse proxy, set {colors.OKCYAN}is_proxied{colors.ENDC} to true to silence this message.\n"
print(sanity_string)
app.config['SQLALCHEMY_DATABASE_URI'] = f"sqlite:///{config['general']['db_file_path']}"
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.add_url_rule('/', view_func=views.index)
@@ -38,6 +60,8 @@ def setup():
app.add_url_rule('/home', view_func=views.home)
app.add_url_rule('/api/', view_func=ythdd_api.api_greeting)
app.add_url_rule('/api/<path:received_request>', view_func=ythdd_api.api_global_catchall)
app.add_url_rule('/vi/<path:received_request>', view_func=views.thumbnailProxy)
app.add_url_rule('/ggpht/<path:received_request>', view_func=views.ggphtProxy)
db = ythdd_db.initDB(app, config)
with app.app_context():
@@ -81,14 +105,18 @@ def main(args):
host = host_port[0]
port = host_port[1]
global config
global config, app_host, app_port
try:
# if specified, use custom config file
ythdd_globals.configfile = args.config
ythdd_globals.setConfig(ythdd_globals.configfile)
except:
# if not, use dummy file
# if not, try using the default "config.toml"
if os.path.exists("config.toml"):
ythdd_globals.configfile = "config.toml"
else:
# unless it's not there, if that's the case then use the dummy file
ythdd_globals.configfile = ""
# but try to set the API secret if provided by the user
if args.secret:
@@ -97,6 +125,9 @@ def main(args):
config = ythdd_globals.config
app_host = host
app_port = port
setup()
app.run(host=host, port=int(port))
@@ -115,4 +146,6 @@ if __name__ == "__main__":
main(args)
else:
app_host = os.getenv("FLASK_RUN_HOST", "None")
app_port = os.getenv("FLASK_RUN_PORT", "None")
setup()

View File

@@ -113,7 +113,7 @@ def hot(data):
started = time.time()
try:
# try to actually get the data
extracted_related = ythdd_extractor.related('https://www.youtube.com/watch?v=' + videoId)
extracted_related = ythdd_extractor.WEBrelated('https://www.youtube.com/watch?v=' + videoId)
extracted_related['took'] = time.time() - started
return 200, "OK", extracted_related
except KeyError:

View File

@@ -1,5 +1,5 @@
#!/usr/bin/python3
import yt_dlp, requests, json
import brotli, yt_dlp, requests, json, time
import ythdd_globals
ytdl_opts = {
@@ -15,6 +15,89 @@ ytdl_opts = {
"simulate": True
}
# Identity of the "IOS" client used for the stage 1 request. The same
# version, device model and user-agent appear both in the HTTP headers and
# in the JSON request context, so each value is spelled out exactly once.
_IOS_CLIENT_VERSION = "19.45.4"
_IOS_DEVICE_MODEL = "iPhone16,2"
_IOS_USER_AGENT = (
    f"com.google.ios.youtube/{_IOS_CLIENT_VERSION} "
    f"({_IOS_DEVICE_MODEL}; U; CPU iOS 18_1_0 like Mac OS X;)"
)

# HTTP headers for stage 1 (the POST to /youtubei/v1/player in IOSextract).
stage1_headers = {
    "Connection": "keep-alive",
    "User-Agent": _IOS_USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    "Sec-Fetch-Mode": "navigate",
    "Content-Type": "application/json",
    "X-Youtube-Client-Name": "5",
    "X-Youtube-Client-Version": _IOS_CLIENT_VERSION,
    "Origin": "https://www.youtube.com",
    "Accept-Encoding": "gzip, deflate, br",
    "Cookie": "PREF=hl=en&tz=UTC; SOCS=CAI",
}

# JSON body template for stage 1. "videoId" is deliberately not part of the
# template; IOSextract supplies it for each request.
stage1_body = {
    "context": {
        "client": {
            "clientName": "IOS",
            "clientVersion": _IOS_CLIENT_VERSION,
            "deviceMake": "Apple",
            "deviceModel": _IOS_DEVICE_MODEL,
            "userAgent": _IOS_USER_AGENT,
            "osName": "iPhone",
            "osVersion": "18.1.0.22B83",
            "hl": "en",
            "timeZone": "UTC",
            "utcOffsetMinutes": 0,
        },
    },
    # "videoId": uri,
    "playbackContext": {
        "contentPlaybackContext": {
            "html5Preference": "HTML5_PREF_WANTS",
        },
    },
    "contentCheckOk": True,
    "racyCheckOk": True,
}
# Desktop browser identity shared by the stage 2 and stage 3 requests, and
# the "WEB" client version that stage 3 reports in both header and body.
_WEB_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
)
_WEB_CLIENT_VERSION = "2.20241126.01.00"

# HTTP headers for stage 2 (the HLS manifest download, which is currently
# commented out in IOSextract).
stage2_headers = {
    "Connection": "keep-alive",
    "User-Agent": _WEB_USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    "Sec-Fetch-Mode": "navigate",
    "Accept-Encoding": "gzip, deflate, br",
}

# HTTP headers for stage 3 (the POST to /youtubei/v1/next in IOSextract).
stage3_headers = {
    "Connection": "keep-alive",
    "User-Agent": _WEB_USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    "Sec-Fetch-Mode": "navigate",
    "Content-Type": "application/json",
    "X-Youtube-Client-Name": "1",
    "X-Youtube-Client-Version": _WEB_CLIENT_VERSION,
    "Origin": "https://www.youtube.com",
    "Accept-Encoding": "gzip, deflate, br",
    "Cookie": "PREF=hl=en&tz=UTC; SOCS=CAI",
}

# JSON body template for stage 3. "videoId" is deliberately not part of the
# template; IOSextract supplies it for each request.
stage3_body = {
    "context": {
        "client": {
            "clientName": "WEB",
            "clientVersion": _WEB_CLIENT_VERSION,
            "hl": "en",
            "timeZone": "UTC",
            "utcOffsetMinutes": 0,
        },
    },
    # "videoId": uri,
    "contentCheckOk": True,
    "racyCheckOk": True,
}
def extract(url: str, getcomments=False, maxcomments=""):
# TODO: check user-agent and cookiefile
@@ -34,7 +117,7 @@ def extract(url: str, getcomments=False, maxcomments=""):
result = ytdl.extract_info(url, download=False)
return result
def related(url: str):
def WEBrelated(url: str):
# WARNING! HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME
if len(url) == 11:
params = {'v': url}
@@ -45,30 +128,7 @@ def related(url: str):
videoId = url[32:44]
params = {'v': videoId}
# NOTE: use ESR user-agent
# user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:130.0) Gecko/20100101 Firefox/130.0'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0'
if ythdd_globals.config['extractor']['user-agent']:
user_agent = ythdd_globals.config['extractor']['user-agent']
headers = {
'User-Agent': user_agent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Sec-GPC': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Priority': 'u=0, i',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
response = requests.get(url, headers=headers, params=params)
response = requests.get(url, headers=ythdd_globals.getHeaders(caller='extractor'), params=params)
extracted_string = str(response.content.decode('utf8', 'unicode_escape'))
start = extracted_string.find('{"responseContext":{"serviceTrackingParams":')
start2 = extracted_string.find('{"responseContext":{"serviceTrackingParams":', start + 1)
@@ -76,3 +136,47 @@ def related(url: str):
extracted_json = json.loads(extracted_string[start2:end])
return extracted_json["contents"]['twoColumnWatchNextResults']["secondaryResults"]
def WEBextractSinglePage(uri: str):
    """Fetch a watch page and pull out the two JSON blobs embedded in it.

    uri must be exactly 11 characters long; it is appended to
    "https://www.youtube.com/watch?v=".

    Returns a dict with:
      'ec1'  - the first embedded object starting with the
               '{"responseContext":{"serviceTrackingParams":' marker,
      'ec2'  - the second such object,
      'took' - wall-clock seconds spent in this call.

    Raises ValueError when uri is not 11 characters long. If the markers
    are not present in the page, the slices below are not valid JSON and
    json.loads raises json.JSONDecodeError.
    """
    # WARNING! HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME
    start_time = time.time()

    if len(uri) != 11:
        raise ValueError("WEBextractSinglePage expects a single, 11-character long argument")

    response = requests.get(
        "https://www.youtube.com/watch?v=" + uri,
        headers=ythdd_globals.getHeaders(caller='extractor'),
        # Without a timeout a stalled upstream would block the caller forever.
        timeout=10,
    )
    # The second argument of bytes.decode() is the error handler, not a
    # codec: 'unicode_escape' is not a valid handler and would raise
    # LookupError on the first malformed byte. Replace bad bytes instead.
    page = response.content.decode('utf-8', errors='replace')

    marker = '{"responseContext":{"serviceTrackingParams":'

    # First blob: ends at the next ";var ".
    start = page.find(marker)
    end = page.find(';var ', start)

    # Second blob: ends at the closing ";</script>".
    start2 = page.find(marker, start + 1)
    end2 = page.find(';</script>', start2)

    extracted_json1 = json.loads(page[start:end])
    extracted_json2 = json.loads(page[start2:end2])

    end_time = time.time()
    return {'ec1': extracted_json1, 'ec2': extracted_json2, 'took': end_time - start_time}
def IOSextract(uri: str):
    """Query the player and next endpoints for a single video.

    uri must be exactly 11 characters long; it is sent as "videoId".

    Stage 1 POSTs to /youtubei/v1/player with the stage1 headers/body,
    stage 3 POSTs to /youtubei/v1/next with the stage3 headers/body.
    Stage 2 (fetching the HLS manifest) is currently disabled.

    Returns a dict with the decoded JSON of both responses under 'stage1'
    and 'stage3', plus 'took' (wall-clock seconds spent in this call).

    Raises ValueError when uri is not 11 characters long.
    """
    start = time.time()

    if len(uri) != 11:
        raise ValueError("IOSextract expects a single, 11-character long uri as an argument")

    # Build per-call request bodies instead of writing 'videoId' into the
    # module-level templates: those dicts are shared by every call, so
    # mutating them lets overlapping requests overwrite each other's id.
    player_body = {**stage1_body, 'videoId': uri}
    stage1_h = requests.post(
        "https://www.youtube.com/youtubei/v1/player?prettyPrint=false",
        headers=stage1_headers,
        json=player_body,
        # Without a timeout a stalled upstream would block the caller forever.
        timeout=10,
    )
    stage1 = json.loads(stage1_h.content.decode('utf-8'))

    #stage2_h = requests.get(stage1['streamingData']['hlsManifestUrl'], headers=stage2_headers)
    #stage2 = stage2_h.content.decode('utf-8')

    next_body = {**stage3_body, 'videoId': uri}
    stage3_h = requests.post(
        "https://www.youtube.com/youtubei/v1/next?prettyPrint=false",
        headers=stage3_headers,
        json=next_body,
        timeout=10,
    )
    stage3 = json.loads(stage3_h.content.decode('utf-8'))

    end = time.time()

    #return {'stage1': stage1, 'stage2': stage2, 'stage3': stage3, 'took': end - start}
    return {'stage1': stage1, 'stage3': stage3, 'took': end - start}

View File

@@ -30,16 +30,16 @@ def getConfig(configfile):
global randomly_generated_passcode
if not os.path.exists(configfile):
dummy_config = {'general': {'db_file_path': 'ythdd_db.sqlite', 'video_storage_directory_path': 'videos/', 'is_proxied': False}, 'api': {'api_key': 'CHANGEME'}, 'extractor': {'user-agent': '', 'cookies_path': ''}, 'admin': {'admins': ['admin']}, 'yt_dlp': {}, 'postprocessing': {'presets': [{'name': 'recommended: [N][<=720p] best V+A', 'format': 'bv[height<=720]+ba', 'reencode': ''}, {'name': '[N][1080p] best V+A', 'format': 'bv[height=1080]+ba', 'reencode': ''}, {'name': '[R][1080p] webm', 'format': 'bv[height=1080]+ba', 'reencode': 'webm'}, {'name': '[N][720p] best V+A', 'format': 'bv[height=720]+ba', 'reencode': ''}, {'name': '[R][720p] webm', 'format': 'bv[height=720]+ba', 'reencode': 'webm'}, {'name': '[N][480p] best V+A', 'format': 'bv[height=480]+ba', 'reencode': ''}, {'name': '[480p] VP9 webm/reencode', 'format': 'bv*[height=480][ext=webm]+ba/bv[height=480]+ba', 'reencode': 'webm'}, {'name': '[N][1080p] best video only', 'format': 'bv[height=1080]', 'reencode': ''}, {'name': '[N][opus] best audio only', 'format': 'ba', 'reencode': 'opus'}]}}
dummy_config = {'general': {'db_file_path': 'ythdd_db.sqlite', 'video_storage_directory_path': 'videos/', 'is_proxied': False, 'public_facing_url': 'http://localhost:5000/'}, 'api': {'api_key': 'CHANGEME'}, 'extractor': {'user-agent': '', 'cookies_path': ''}, 'admin': {'admins': ['admin']}, 'yt_dlp': {}, 'postprocessing': {'presets': [{'name': 'recommended: [N][<=720p] best V+A', 'format': 'bv[height<=720]+ba', 'reencode': ''}, {'name': '[N][1080p] best V+A', 'format': 'bv[height=1080]+ba', 'reencode': ''}, {'name': '[R][1080p] webm', 'format': 'bv[height=1080]+ba', 'reencode': 'webm'}, {'name': '[N][720p] best V+A', 'format': 'bv[height=720]+ba', 'reencode': ''}, {'name': '[R][720p] webm', 'format': 'bv[height=720]+ba', 'reencode': 'webm'}, {'name': '[N][480p] best V+A', 'format': 'bv[height=480]+ba', 'reencode': ''}, {'name': '[480p] VP9 webm/reencode', 'format': 'bv*[height=480][ext=webm]+ba/bv[height=480]+ba', 'reencode': 'webm'}, {'name': '[N][1080p] best video only', 'format': 'bv[height=1080]', 'reencode': ''}, {'name': '[N][opus] best audio only', 'format': 'ba', 'reencode': 'opus'}]}}
# if a passcode has not been provided by the user (config file doesn't exist, and user didn't specify it using an argument)
print(f"{colors.WARNING}WARNING{colors.ENDC}: Using default, baked in config data. {colors.ENDL}"
f"Consider copying and editing the provided example file ({colors.OKCYAN}config.default.toml{colors.ENDC}).")
f" Consider copying and editing the provided example file ({colors.OKCYAN}config.default.toml{colors.ENDC}).")
if randomly_generated_passcode == 0:
# generate a pseudorandom one and use it in the temporary config
randomly_generated_passcode = str(int(time.time() * 1337 % 899_999 + 100_000))
print(f"{colors.WARNING}WARNING{colors.ENDC}: Default config populated with one-time, insecure pseudorandom admin API key: {colors.OKCYAN}{randomly_generated_passcode}{colors.ENDC}."
f" {colors.ENDL}The admin API key is not the Flask debugger PIN. You need to provide a config file for persistence!{colors.ENDL}")
print(f"{colors.WARNING}WARNING{colors.ENDC}: Default config populated with one-time, insecure pseudorandom admin API key: {colors.OKCYAN}{randomly_generated_passcode}{colors.ENDC}.\n"
f" The admin API key is not the Flask debugger PIN. You need to provide a config file for persistence!{colors.ENDL}")
dummy_config['api']['api_key_admin'] = randomly_generated_passcode
return dummy_config
@@ -54,5 +54,40 @@ def setConfig(configfile):
#setConfig(configfile)
config = {}
def getHeaders(caller="proxy"):
    """Build the browser-like request headers used for outgoing requests.

    caller is the name of the config section whose 'user-agent' entry
    overrides the built-in default when it is set and non-empty. A missing
    section or missing key simply leaves the default in place.

    Returns a new dict of header names to values on every call.
    """
    # NOTE: use ESR user-agent
    # user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:130.0) Gecko/20100101 Firefox/130.0'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0'

    # The baked-in dummy config has no 'proxy' section, so indexing
    # config[caller] directly would raise KeyError there; look it up softly.
    section = config.get(caller) or {}
    configured_agent = section.get('user-agent')
    if configured_agent:
        user_agent = configured_agent

    return {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Sec-GPC': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Priority': 'u=0, i',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }
def translateLinks(link):
    """Rewrite image-host URLs in link so they point at this server.

    "https://i.ytimg.com/<path>" becomes "<public_facing_url><path>" and
    "https://yt3.ggpht.com/<path>" becomes "<public_facing_url>ggpht/<path>".
    Text that contains neither prefix is returned unchanged.
    """
    base = config['general']['public_facing_url']
    # Both replaced prefixes end with "/", so the base has to as well;
    # add the slash if the configured URL was written without it.
    if not base.endswith("/"):
        base += "/"

    link = link.replace("https://i.ytimg.com/", base)
    link = link.replace("https://yt3.ggpht.com/", base + "ggpht/")
    return link
def getUptime():
    """Return the number of whole seconds elapsed since `starttime`."""
    now = int(time.time())
    return now - starttime