From 0099736a74d2897280f0db895bd28a3206d05ec3 Mon Sep 17 00:00:00 2001 From: sherl Date: Thu, 26 Dec 2024 20:15:45 +0100 Subject: [PATCH] new iOS/web extractors, image proxying done by views.py - ythdd_globals.py - added helper function to get user-configured header - ythdd.py - now checks for config.toml in work directory - requirements.txt - add brotli, so that requests can decompress innertube request --- config.default.toml | 14 ++-- requirements.txt | 3 +- views.py | 33 ++++++++- ythdd.py | 43 ++++++++++-- ythdd_api_v1.py | 2 +- ythdd_extractor.py | 158 ++++++++++++++++++++++++++++++++++++-------- ythdd_globals.py | 43 ++++++++++-- 7 files changed, 251 insertions(+), 45 deletions(-) diff --git a/config.default.toml b/config.default.toml index 1a66be2..216fc38 100644 --- a/config.default.toml +++ b/config.default.toml @@ -1,15 +1,19 @@ [general] -db_file_path = "/path/to/ythdd_db.sqlite" # Preferably stored on an SSD. -video_storage_directory_path = "/path/to/videos/" # Path to video vault. -is_proxied = false +db_file_path = "/path/to/ythdd_db.sqlite" # Preferably stored on an SSD. +video_storage_directory_path = "/path/to/videos/" # Path to video vault. +is_proxied = false # Set to true if running behind reverse proxy. +public_facing_url = "http://localhost:5000/" # Used for URL rewriting. Note the trailing slash /. [api] api_key = "" # Leave empty API key for public access to non-sensitive backend api_key_admin = "CHANGEME" # Empty *admin* API key will autogenerate a random one every launch. [extractor] -user-agent = "" # leave empty for default -cookies_path = "" # leave empty for none +user-agent = "" # Leave empty for default (Firefox ESR). +cookies_path = "" # Leave empty for none. + +[proxy] +user-agent = "" # Leave empty for default (Firefox ESR). [admin] # List of users with admin priviledges. 
diff --git a/requirements.txt b/requirements.txt index bf5b261..191ff4f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ Flask-SQLAlchemy>=3.1.1 toml>=0.10.2 Flask-APScheduler>=1.13.1 requests>=2.32.3 -yt_dlp \ No newline at end of file +yt_dlp +brotli>=1.1.0 \ No newline at end of file diff --git a/views.py b/views.py index be71392..fecb393 100644 --- a/views.py +++ b/views.py @@ -1,8 +1,9 @@ #!/usr/bin/python3 -from flask import render_template +from flask import render_template, Response from flask_sqlalchemy import SQLAlchemy from markupsafe import escape import requests, json +import ythdd_globals def homepage(): return "homepage" @@ -11,4 +12,32 @@ def home(): return "welcome home!" def index(): - return "index" \ No newline at end of file + return "index" + +def thumbnailProxy(received_request): + + # apparently, this can be set to + # https://img.youtube.com/ as well + prefix = "https://i.ytimg.com/" + + if received_request.count("/") < 1 or received_request.index("/") != 11: + return Response(json.dumps({ + 'status': 400, + 'error_msg': 'invalid request. 
pretend this is a thumbnail :D' + }), mimetype='application/json', status=400) + + thumbnail = requests.get(prefix + "vi/" + received_request, headers=ythdd_globals.getHeaders(caller='proxy'), stream=True) + thumbnail.raw.decode_content = True + response = Response(thumbnail.raw, mimetype=thumbnail.headers['content-type'], status=thumbnail.status_code) + + return response + +def ggphtProxy(received_request): + + prefix = "https://yt3.ggpht.com/" + + ggpht = requests.get(prefix + received_request, headers=ythdd_globals.getHeaders(caller='proxy'), stream=True) + ggpht.raw.decode_content = True + response = Response(ggpht.raw, mimetype=ggpht.headers['content-type'], status=ggpht.status_code) + + return response diff --git a/ythdd.py b/ythdd.py index 64c857f..bd0426f 100644 --- a/ythdd.py +++ b/ythdd.py @@ -6,15 +6,18 @@ from argparse import ArgumentParser from ythdd_globals import colors import requests, json, toml, time import views, downloader, ythdd_api, ythdd_globals, ythdd_db +import os from flask_apscheduler import APScheduler -app = Flask(__name__) +app = Flask(__name__) +app_host = "None" +app_port = "None" def setup(): # sanity check: make sure config is set # required to make `flask --app ythdd run --debug` work - global config + global config, app_host, app_port try: if not config['general']: ythdd_globals.setConfig(ythdd_globals.configfile) @@ -31,6 +34,25 @@ def setup(): ythdd_globals.isProxied = config['general']['is_proxied'] ythdd_globals.outsideApiHits = 0 + are_we_sure_of_host_and_port = True + if app_host == "None": + app_host = "127.0.0.1" + are_we_sure_of_host_and_port = False + if app_port == "None": + app_port = "5000" + are_we_sure_of_host_and_port = False + + public_facing_url = config['general']['public_facing_url'] + rewrite_sanity_check = public_facing_url.replace(f"{app_host}:{app_port}", "") + if not config['general']['is_proxied'] and public_facing_url == rewrite_sanity_check: + sanity_string = f"{colors.WARNING}Heads up!{colors.ENDC} 
Public facing URL does not match the IP and port the server is running on.\n" + sanity_string += f" Expected: {colors.OKCYAN}{config['general']['public_facing_url']}{colors.ENDC}, but" + if not are_we_sure_of_host_and_port: sanity_string += " (assuming it's)" + sanity_string += f" running on: {colors.OKCYAN}{app_host}:{app_port}{colors.ENDC}.\n" + sanity_string += f" This is just a sanity check and may not neccessarily mean bad configuration.\n" + sanity_string += f" If you're running a reverse proxy, set {colors.OKCYAN}is_proxied{colors.ENDC} to true to silence this message.\n" + print(sanity_string) + app.config['SQLALCHEMY_DATABASE_URI'] = f"sqlite:///{config['general']['db_file_path']}" app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False app.add_url_rule('/', view_func=views.index) @@ -38,6 +60,8 @@ def setup(): app.add_url_rule('/home', view_func=views.home) app.add_url_rule('/api/', view_func=ythdd_api.api_greeting) app.add_url_rule('/api/', view_func=ythdd_api.api_global_catchall) + app.add_url_rule('/vi/', view_func=views.thumbnailProxy) + app.add_url_rule('/ggpht/', view_func=views.ggphtProxy) db = ythdd_db.initDB(app, config) with app.app_context(): @@ -81,15 +105,19 @@ def main(args): host = host_port[0] port = host_port[1] - global config + global config, app_host, app_port try: # if specified, use custom config file ythdd_globals.configfile = args.config ythdd_globals.setConfig(ythdd_globals.configfile) except: - # if not, use dummy file - ythdd_globals.configfile = "" + # if not, try using the default "config.toml" + if os.path.exists("config.toml"): + ythdd_globals.configfile = "config.toml" + else: + # unless it's not there, if that's the case then use the dummy file + ythdd_globals.configfile = "" # but try to set the API secret if provided by the user if args.secret: ythdd_globals.randomly_generated_passcode = args.secret @@ -97,6 +125,9 @@ def main(args): config = ythdd_globals.config + app_host = host + app_port = port + setup() 
app.run(host=host, port=int(port)) @@ -115,4 +146,6 @@ if __name__ == "__main__": main(args) else: + app_host = os.getenv("FLASK_RUN_HOST", "None") + app_port = os.getenv("FLASK_RUN_PORT", "None") setup() \ No newline at end of file diff --git a/ythdd_api_v1.py b/ythdd_api_v1.py index 1555a90..777fab7 100644 --- a/ythdd_api_v1.py +++ b/ythdd_api_v1.py @@ -113,7 +113,7 @@ def hot(data): started = time.time() try: # try to actually get the data - extracted_related = ythdd_extractor.related('https://www.youtube.com/watch?v=' + videoId) + extracted_related = ythdd_extractor.WEBrelated('https://www.youtube.com/watch?v=' + videoId) extracted_related['took'] = time.time() - started return 200, "OK", extracted_related except KeyError: diff --git a/ythdd_extractor.py b/ythdd_extractor.py index 642ddae..19c892a 100644 --- a/ythdd_extractor.py +++ b/ythdd_extractor.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -import yt_dlp, requests, json +import brotli, yt_dlp, requests, json, time import ythdd_globals ytdl_opts = { @@ -15,6 +15,89 @@ ytdl_opts = { "simulate": True } +stage1_headers = { + "Connection": "keep-alive", + "User-Agent": "com.google.ios.youtube/19.45.4 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-us,en;q=0.5", + "Sec-Fetch-Mode": "navigate", + "Content-Type": "application/json", + "X-Youtube-Client-Name": "5", + "X-Youtube-Client-Version": "19.45.4", + "Origin": "https://www.youtube.com", + "Accept-Encoding": "gzip, deflate, br", + "Cookie": "PREF=hl=en&tz=UTC; SOCS=CAI" +} + +stage1_body = { + "context": + { + "client": + { + "clientName": "IOS", + "clientVersion": "19.45.4", + "deviceMake": "Apple", + "deviceModel": "iPhone16,2", + "userAgent": "com.google.ios.youtube/19.45.4 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)", + "osName": "iPhone", + "osVersion": "18.1.0.22B83", + "hl": "en", + "timeZone": "UTC", + "utcOffsetMinutes": 0 + } + }, + #"videoId": uri, 
+ "playbackContext": + { + "contentPlaybackContext": + { + "html5Preference": "HTML5_PREF_WANTS" + } + }, + "contentCheckOk": True, + "racyCheckOk": True +} + +stage2_headers = { + "Connection": "keep-alive", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-us,en;q=0.5", + "Sec-Fetch-Mode": "navigate", + "Accept-Encoding": "gzip, deflate, br" +} + +stage3_headers = { + "Connection": "keep-alive", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-us,en;q=0.5", + "Sec-Fetch-Mode": "navigate", + "Content-Type": "application/json", + "X-Youtube-Client-Name": "1", + "X-Youtube-Client-Version": "2.20241126.01.00", + "Origin": "https://www.youtube.com", + "Accept-Encoding": "gzip, deflate, br", + "Cookie": "PREF=hl=en&tz=UTC; SOCS=CAI" +} + +stage3_body = { + "context": + { + "client": + { + "clientName": "WEB", + "clientVersion": "2.20241126.01.00", + "hl": "en", + "timeZone": "UTC", + "utcOffsetMinutes": 0 + } + }, + #"videoId": uri, + "contentCheckOk": True, + "racyCheckOk": True +} + def extract(url: str, getcomments=False, maxcomments=""): # TODO: check user-agent and cookiefile @@ -34,7 +117,7 @@ def extract(url: str, getcomments=False, maxcomments=""): result = ytdl.extract_info(url, download=False) return result -def related(url: str): +def WEBrelated(url: str): # WARNING! 
HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME if len(url) == 11: params = {'v': url} @@ -45,34 +128,55 @@ def related(url: str): videoId = url[32:44] params = {'v': videoId} - # NOTE: use ESR user-agent - # user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:130.0) Gecko/20100101 Firefox/130.0' - user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0' - - if ythdd_globals.config['extractor']['user-agent']: - user_agent = ythdd_globals.config['extractor']['user-agent'] - - headers = { - 'User-Agent': user_agent, - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'DNT': '1', - 'Sec-GPC': '1', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Priority': 'u=0, i', - 'Pragma': 'no-cache', - 'Cache-Control': 'no-cache', - } - response = requests.get(url, headers=headers, params=params) + response = requests.get(url, headers=ythdd_globals.getHeaders(caller='extractor'), params=params) extracted_string = str(response.content.decode('utf8', 'unicode_escape')) start = extracted_string.find('{"responseContext":{"serviceTrackingParams":') start2 = extracted_string.find('{"responseContext":{"serviceTrackingParams":', start + 1) end = extracted_string.find(';', start2) extracted_json = json.loads(extracted_string[start2:end]) - return extracted_json["contents"]['twoColumnWatchNextResults']["secondaryResults"] \ No newline at end of file + return extracted_json["contents"]['twoColumnWatchNextResults']["secondaryResults"] + +def WEBextractSinglePage(uri: str): + # WARNING! 
HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME + + start_time = time.time() + + if len(uri) != 11: + raise ValueError("WEBextractSinglePage expects a single, 11-character long argument") + + response = requests.get("https://www.youtube.com/watch?v=" + uri, headers=ythdd_globals.getHeaders(caller='extractor')) + extracted_string = str(response.content.decode('utf8', 'unicode_escape')) + start = extracted_string.find('{"responseContext":{"serviceTrackingParams":') + end = extracted_string.find(';var ', start) + start2 = extracted_string.find('{"responseContext":{"serviceTrackingParams":', start + 1) + end2 = extracted_string.find(';', start2) + extracted_json1 = json.loads(extracted_string[start:end]) + extracted_json2 = json.loads(extracted_string[start2:end2]) + + end_time = time.time() + + return {'ec1': extracted_json1, 'ec2': extracted_json2, 'took': end_time - start_time} + +def IOSextract(uri: str): + + start = time.time() + + if len(uri) != 11: + raise ValueError("IOSextract expects a single, 11-character long uri as an argument") + + stage1_body['videoId'] = uri + stage1_h = requests.post("https://www.youtube.com/youtubei/v1/player?prettyPrint=false", headers=stage1_headers, json=stage1_body) + stage1 = json.loads(stage1_h.content.decode('utf-8')) + + #stage2_h = requests.get(stage1['streamingData']['hlsManifestUrl'], headers=stage2_headers) + #stage2 = stage2_h.content.decode('utf-8') + + stage3_body['videoId'] = uri + stage3_h = requests.post("https://www.youtube.com/youtubei/v1/next?prettyPrint=false", headers=stage3_headers, json=stage3_body) + stage3 = json.loads(stage3_h.content.decode('utf-8')) + + end = time.time() + + #return {'stage1': stage1, 'stage2': stage2, 'stage3': stage3, 'took': end - start} + return {'stage1': stage1, 'stage3': stage3, 'took': end - start} \ No newline at end of file diff --git a/ythdd_globals.py b/ythdd_globals.py index 2e12cc3..ed18f18 100644 --- a/ythdd_globals.py +++ b/ythdd_globals.py @@ -30,16 +30,16 @@ def 
getConfig(configfile): global randomly_generated_passcode if not os.path.exists(configfile): - dummy_config = {'general': {'db_file_path': 'ythdd_db.sqlite', 'video_storage_directory_path': 'videos/', 'is_proxied': False}, 'api': {'api_key': 'CHANGEME'}, 'extractor': {'user-agent': '', 'cookies_path': ''}, 'admin': {'admins': ['admin']}, 'yt_dlp': {}, 'postprocessing': {'presets': [{'name': 'recommended: [N][<=720p] best V+A', 'format': 'bv[height<=720]+ba', 'reencode': ''}, {'name': '[N][1080p] best V+A', 'format': 'bv[height=1080]+ba', 'reencode': ''}, {'name': '[R][1080p] webm', 'format': 'bv[height=1080]+ba', 'reencode': 'webm'}, {'name': '[N][720p] best V+A', 'format': 'bv[height=720]+ba', 'reencode': ''}, {'name': '[R][720p] webm', 'format': 'bv[height=720]+ba', 'reencode': 'webm'}, {'name': '[N][480p] best V+A', 'format': 'bv[height=480]+ba', 'reencode': ''}, {'name': '[480p] VP9 webm/reencode', 'format': 'bv*[height=480][ext=webm]+ba/bv[height=480]+ba', 'reencode': 'webm'}, {'name': '[N][1080p] best video only', 'format': 'bv[height=1080]', 'reencode': ''}, {'name': '[N][opus] best audio only', 'format': 'ba', 'reencode': 'opus'}]}} + dummy_config = {'general': {'db_file_path': 'ythdd_db.sqlite', 'video_storage_directory_path': 'videos/', 'is_proxied': False, 'public_facing_url': 'http://localhost:5000/'}, 'api': {'api_key': 'CHANGEME'}, 'extractor': {'user-agent': '', 'cookies_path': ''}, 'admin': {'admins': ['admin']}, 'yt_dlp': {}, 'postprocessing': {'presets': [{'name': 'recommended: [N][<=720p] best V+A', 'format': 'bv[height<=720]+ba', 'reencode': ''}, {'name': '[N][1080p] best V+A', 'format': 'bv[height=1080]+ba', 'reencode': ''}, {'name': '[R][1080p] webm', 'format': 'bv[height=1080]+ba', 'reencode': 'webm'}, {'name': '[N][720p] best V+A', 'format': 'bv[height=720]+ba', 'reencode': ''}, {'name': '[R][720p] webm', 'format': 'bv[height=720]+ba', 'reencode': 'webm'}, {'name': '[N][480p] best V+A', 'format': 'bv[height=480]+ba', 'reencode': ''}, 
{'name': '[480p] VP9 webm/reencode', 'format': 'bv*[height=480][ext=webm]+ba/bv[height=480]+ba', 'reencode': 'webm'}, {'name': '[N][1080p] best video only', 'format': 'bv[height=1080]', 'reencode': ''}, {'name': '[N][opus] best audio only', 'format': 'ba', 'reencode': 'opus'}]}} # if a passcode has not been provided by the user (config file doesn't exist, and user didn't specify it using an argument) print(f"{colors.WARNING}WARNING{colors.ENDC}: Using default, baked in config data. {colors.ENDL}" - f"Consider copying and editing the provided example file ({colors.OKCYAN}config.default.toml{colors.ENDC}).") + f" Consider copying and editing the provided example file ({colors.OKCYAN}config.default.toml{colors.ENDC}).") if randomly_generated_passcode == 0: # generate a pseudorandom one and use it in the temporary config randomly_generated_passcode = str(int(time.time() * 1337 % 899_999 + 100_000)) - print(f"{colors.WARNING}WARNING{colors.ENDC}: Default config populated with one-time, insecure pseudorandom admin API key: {colors.OKCYAN}{randomly_generated_passcode}{colors.ENDC}." - f" {colors.ENDL}The admin API key is not the Flask debugger PIN. You need to provide a config file for persistence!{colors.ENDL}") + print(f"{colors.WARNING}WARNING{colors.ENDC}: Default config populated with one-time, insecure pseudorandom admin API key: {colors.OKCYAN}{randomly_generated_passcode}{colors.ENDC}.\n" + f" The admin API key is not the Flask debugger PIN. 
You need to provide a config file for persistence!{colors.ENDL}") dummy_config['api']['api_key_admin'] = randomly_generated_passcode return dummy_config @@ -54,5 +54,40 @@ def setConfig(configfile): #setConfig(configfile) config = {} +def getHeaders(caller="proxy"): + + # NOTE: use ESR user-agent + # user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:130.0) Gecko/20100101 Firefox/130.0' + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0' + + if config[caller]['user-agent']: + user_agent = config[caller]['user-agent'] + + headers = { + 'User-Agent': user_agent, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'DNT': '1', + 'Sec-GPC': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-User': '?1', + 'Priority': 'u=0, i', + 'Pragma': 'no-cache', + 'Cache-Control': 'no-cache', + } + + return headers + +def translateLinks(link): + + link = link.replace("https://i.ytimg.com/", config['general']['public_facing_url']) + link = link.replace("https://yt3.ggpht.com/", config['general']['public_facing_url'] + "ggpht/") + + return link + def getUptime(): return int(time.time()) - starttime \ No newline at end of file