# ythdd/ythdd_extractor.py

#!/usr/bin/python3
import yt_dlp, requests, json
import ythdd_globals

# Baseline yt-dlp options used as the starting point for every extraction.
# Options that are intentionally left disabled for now:
#   "format": "bv*[height<=720]+ba"      (to be defined by the user)
#   "getcomments": True
#   "extractor_args": {"maxcomments": ...}
#   "writeinfojson": True
#   "progress_hooks": my_hook
ytdl_opts = dict(
	# Output file name templates, keyed by yt-dlp output type.
	outtmpl=dict(
		default="%(id)s.%(ext)s",
		chapter="%(id)s.%(ext)s_%(section_number)03d_%(section_title)s.%(ext)s",
	),
	# Passed to yt-dlp as the "simulate" option.
	simulate=True,
)

def extract(url: str, getcomments=False, maxcomments=""):
	"""Run yt-dlp's info extraction for a video and return the result.

	Args:
		url: A video URL, or a bare 11-character video ID (which is expanded
			to a "https://www.youtube.com/watch?v=" URL).
		getcomments: When truthy, the "getcomments" option is enabled.
		maxcomments: When truthy, used as the first "max_comments" extractor
			argument for the youtube extractor; the remaining three values
			are set to "all".

	Returns:
		Whatever ``YoutubeDL.extract_info(url, download=False)`` returns.
	"""
	# TODO: check user-agent and cookiefile
	extractor_cfg = ythdd_globals.config['extractor']

	if extractor_cfg['user-agent']:
		yt_dlp.utils.std_headers['User-Agent'] = extractor_cfg['user-agent']

	# Build the options on a per-call copy. Writing into the shared
	# module-level dict would make "getcomments", "extractor_args" and
	# "cookiefile" from one call silently apply to every later call.
	# (Shallow copy: nested values such as "outtmpl" are still shared.)
	opts = dict(ytdl_opts)

	if extractor_cfg['cookies_path']:
		opts['cookiefile'] = extractor_cfg['cookies_path']

	if len(url) == 11:
		url = "https://www.youtube.com/watch?v=" + url
	if getcomments:
		opts['getcomments'] = True
	if maxcomments:
		# Extractor arguments are handled as strings by yt-dlp, so a numeric
		# limit is normalised here; a string passes through unchanged.
		opts['extractor_args'] = {'youtube': {'max_comments': [str(maxcomments), "all", "all", "all"]}}

	with yt_dlp.YoutubeDL(opts) as ytdl:
		return ytdl.extract_info(url, download=False)

def related(url: str):
	"""Scrape the secondary ("related") results from a YouTube watch page.

	WARNING! HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME: the data is cut out
	of the watch page's HTML by searching for fixed marker strings.

	Args:
		url: Either a bare 11-character video ID, or a string containing
			"https://www.youtube.com/watch?v=" followed by the video ID.

	Returns:
		The value stored under
		``["contents"]["twoColumnWatchNextResults"]["secondaryResults"]``
		in the JSON object embedded in the page.

	Raises:
		ValueError: If ``url`` is neither an 11-character ID nor contains the
			watch URL prefix.
		json.JSONDecodeError: If the embedded JSON cannot be located or parsed
			(this is a subclass of ValueError).
		KeyError: If the parsed JSON does not have the expected keys.
	"""
	watch_prefix = "https://www.youtube.com/watch?v="
	id_length = 11

	if len(url) == id_length:
		video_id = url
	else:
		# Locate the ID relative to wherever the prefix actually occurs, and
		# take exactly 11 characters so trailing "&..." parameters are dropped.
		_, found, tail = url.partition(watch_prefix)
		if not found:
			raise ValueError(f"not a YouTube video ID or watch URL: {url!r}")
		video_id = tail[:id_length]

	# NOTE: use ESR user-agent
	# user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:130.0) Gecko/20100101 Firefox/130.0'
	user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0'

	# A user-agent set in the configuration overrides the built-in one.
	if ythdd_globals.config['extractor']['user-agent']:
		user_agent = ythdd_globals.config['extractor']['user-agent']

	headers = {
		'User-Agent': user_agent,
		'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
		'Accept-Language': 'en-US,en;q=0.5',
		'DNT': '1',
		'Sec-GPC': '1',
		'Connection': 'keep-alive',
		'Upgrade-Insecure-Requests': '1',
		'Sec-Fetch-Dest': 'document',
		'Sec-Fetch-Mode': 'navigate',
		'Sec-Fetch-Site': 'none',
		'Sec-Fetch-User': '?1',
		'Priority': 'u=0, i',
		'Pragma': 'no-cache',
		'Cache-Control': 'no-cache',
	}

	# Always request the canonical watch endpoint and pass the ID as a query
	# parameter: a bare ID is not a valid URL, and re-using a full watch URL
	# would append a second "v" parameter to it.
	response = requests.get(
		"https://www.youtube.com/watch",
		headers=headers,
		params={'v': video_id},
		timeout=30,  # seconds; without it a stalled connection blocks forever
	)
	# The second positional argument of bytes.decode() is the error handler,
	# not a codec; undecodable bytes are replaced instead of aborting.
	page = response.content.decode('utf-8', errors='replace')

	# The marker occurs more than once in the page; the JSON object of
	# interest starts at its second occurrence and runs up to ';</script>'.
	marker = '{"responseContext":{"serviceTrackingParams":'
	first = page.find(marker)
	second = page.find(marker, first + 1) if first != -1 else -1
	end = page.find(';</script>', second) if second != -1 else -1
	if end == -1:
		raise json.JSONDecodeError("embedded watch page JSON not found", page, 0)

	extracted_json = json.loads(page[second:end])

	return extracted_json["contents"]['twoColumnWatchNextResults']["secondaryResults"]