# ythdd/ythdd_extractor.py

#!/usr/bin/python3
import brotli, yt_dlp, requests, json, time
from ythdd_globals import safeTraverse
import ythdd_globals

# Base options handed to yt_dlp.YoutubeDL by extract().
# Per-call settings (cookie file, comment fetching, youtube extractor args)
# are layered on top of these by extract() itself.
ytdl_opts = {
	#"format": "bv*[height<=720]+ba", # to be defined by the user
	#"getcomments": True,
	#"extractor_args": {"maxcomments": ...},
	#"writeinfojson": True,
	#"progress_hooks": my_hook,
	# output filename templates, keyed by yt-dlp output type
	"outtmpl": {
		"default": "%(id)s.%(ext)s",
		"chapter": "%(id)s.%(ext)s_%(section_number)03d_%(section_title)s.%(ext)s"
	},
	# extract() fills the "youtube" sub-dict ('max_comments', 'player_client')
	"extractor_args": {
		"youtube": {
				# "formats": ["dashy"]
			}
	},
	"simulate": True
}

# HTTP headers for the first request made by IOSextract()
# (POST to /youtubei/v1/player). The User-Agent and X-Youtube-Client-Version
# carry the same 19.45.4 iOS app version that stage1_body declares, so keep
# the two in sync when bumping the version.
stage1_headers = {
	"Connection": "keep-alive",
	"User-Agent": "com.google.ios.youtube/19.45.4 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)",
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	"Accept-Language": "en-us,en;q=0.5",
	"Sec-Fetch-Mode": "navigate",
	"Content-Type": "application/json",
	"X-Youtube-Client-Name": "5",
	"X-Youtube-Client-Version": "19.45.4",
	"Origin": "https://www.youtube.com",
	"Accept-Encoding": "gzip, deflate, br",
	"Cookie": "PREF=hl=en&tz=UTC; SOCS=CAI"
}

# JSON body template for the first request made by IOSextract()
# (POST to /youtubei/v1/player), describing an IOS client.
# The "videoId" key is not part of the template; IOSextract() supplies it
# for each request.
stage1_body = {
	"context":
	{
		"client":
		{
			"clientName": "IOS",
			"clientVersion": "19.45.4",
			"deviceMake": "Apple",
			"deviceModel": "iPhone16,2",
			"userAgent": "com.google.ios.youtube/19.45.4 (iPhone16,2; U; CPU iOS 18_1_0 like Mac OS X;)",
			"osName": "iPhone",
			"osVersion": "18.1.0.22B83",
			"hl": "en",
			"timeZone": "UTC",
			"utcOffsetMinutes": 0
		}
	},
	#"videoId": uri,
	"playbackContext":
	{
		"contentPlaybackContext":
		{
			"html5Preference": "HTML5_PREF_WANTS"
		}
	},
	"contentCheckOk": True,
	"racyCheckOk": True
}

# Browser-like headers (Firefox 142 User-Agent) without any innertube-specific
# fields. Within this file they are only referenced by the commented-out
# HLS manifest request ("stage2") in IOSextract().
stage2_headers = {
	"Connection": "keep-alive",
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	"Accept-Language": "en-us,en;q=0.5",
	"Sec-Fetch-Mode": "navigate",
	"Accept-Encoding": "gzip, deflate, br"
}

# HTTP headers for the second request made by IOSextract()
# (POST to /youtubei/v1/next). X-Youtube-Client-Version matches the
# clientVersion declared in stage3_body; keep the two in sync.
stage3_headers = {
	"Connection": "keep-alive",
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	"Accept-Language": "en-us,en;q=0.5",
	"Sec-Fetch-Mode": "navigate",
	"Content-Type": "application/json",
	"X-Youtube-Client-Name": "1",
	"X-Youtube-Client-Version": "2.20250829.01.00",
	"Origin": "https://www.youtube.com",
	"Accept-Encoding": "gzip, deflate, br",
	"Cookie": "PREF=hl=en&tz=UTC; SOCS=CAI"
}

# JSON body template for the second request made by IOSextract()
# (POST to /youtubei/v1/next), describing a WEB client.
# The "videoId" key is not part of the template; IOSextract() supplies it
# for each request.
stage3_body = {
	"context":
	{
		"client":
		{
			"clientName": "WEB",
			"clientVersion": "2.20250829.01.00",
			"hl": "en",
			"timeZone": "UTC",
			"utcOffsetMinutes": 0
		}
	},
	#"videoId": uri,
	"contentCheckOk": True,
	"racyCheckOk": True
}

# Base request context describing a desktop WEB client (Firefox 142 on Windows).
# makeWebContext() combines it with caller-supplied top-level keys
# (e.g. 'browseId' in browseAbout()) to form a full request body.
web_context_dict = {
    'context': {
        'client': {
            'hl': 'en',
            'gl': 'US',
            'deviceMake': '',
            'deviceModel': '',
            'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0,gzip(gfe)',
            'clientName': 'WEB',
            'clientVersion': '2.20250829.01.00',
            'osName': 'Windows',
            'osVersion': '10.0',
            'screenPixelDensity': 2,
            'platform': 'DESKTOP',
            'screenDensityFloat': 2,
            'userInterfaceTheme': 'USER_INTERFACE_THEME_LIGHT',
            'browserName': 'Firefox',
            'browserVersion': '142.0',
            'acceptHeader': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'utcOffsetMinutes': 0,
        }
    }
}

def extract(url: str, getcomments=False, maxcomments="", manifest_fix=False):
	# Runs yt-dlp's info extraction (no download) and returns the sanitized result.
	#
	# url          - full video URL, or a bare 11-character video ID
	#                (expanded to a youtube.com/watch URL).
	# getcomments  - when truthy, sets yt-dlp's 'getcomments' option.
	# maxcomments  - when truthy, used as the first element of the youtube
	#                'max_comments' extractor argument.
	# manifest_fix - when truthy, sets the youtube 'player_client' extractor
	#                argument to ['default', 'web_safari'].
	#
	# Returns whatever YoutubeDL.sanitize_info() yields for the extracted info.
	# TODO: check user-agent and cookiefile
	import copy

	# Work on a private copy of the module-level defaults. Writing into
	# ytdl_opts directly would make per-call settings (comments, max_comments,
	# player_client) stick around and silently apply to every later call.
	opts = copy.deepcopy(ytdl_opts)
	youtube_args = opts['extractor_args']['youtube']
	extractor_config = ythdd_globals.config['extractor']

	if extractor_config['user-agent']:
		# std_headers is yt-dlp's own process-wide header table
		yt_dlp.utils.std_headers['User-Agent'] = extractor_config['user-agent']

	if extractor_config['cookies_path']:
		opts['cookiefile'] = extractor_config['cookies_path']

	if len(url) == 11:
		url = "https://www.youtube.com/watch?v=" + url
	if getcomments:
		opts['getcomments'] = True
	if maxcomments:
		youtube_args['max_comments'] = [maxcomments, "all", "all", "all"]
	if manifest_fix:
		# https://github.com/yt-dlp/yt-dlp/issues/11952#issuecomment-2565802294
		youtube_args['player_client'] = ['default', 'web_safari']

	with yt_dlp.YoutubeDL(opts) as ytdl:
		result = ytdl.sanitize_info(ytdl.extract_info(url, download=False))
	return result

def WEBrelated(url: str):
	# WARNING! HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME
	# Fetches the watch page of a video and returns the "secondaryResults"
	# object found under contents.twoColumnWatchNextResults in the second
	# JSON blob embedded in the page.
	#
	# url - a bare 11-character video ID, or a URL containing
	#       "https://www.youtube.com/watch?v=" followed by the ID.
	#
	# Raises ValueError if the URL is not recognised or the embedded JSON
	# cannot be located/parsed, KeyError if the expected keys are missing.
	watch_prefix = "https://www.youtube.com/watch?v="
	json_marker  = '{"responseContext":{"serviceTrackingParams":'

	if len(url) == 11:
		video_id = url
	else:
		prefix_at = url.find(watch_prefix)
		if prefix_at == -1:
			raise ValueError(f"WEBrelated expects a video ID or a youtube.com/watch URL, got {url!r}")
		# take exactly the 11 characters following the prefix, wherever it was found
		id_start = prefix_at + len(watch_prefix)
		video_id = url[id_start:id_start + 11]

	# Always request the canonical watch URL: a bare ID is not a valid URL,
	# and a full URL already carries its own "v" parameter.
	response = requests.get(
		"https://www.youtube.com/watch",
		headers=ythdd_globals.getHeaders(caller='extractor'),
		params={'v': video_id},
	)
	# The second argument of decode() is the error handler, not a codec;
	# malformed bytes are replaced rather than aborting the whole extraction.
	page = response.content.decode('utf-8', errors='replace')

	# the page embeds two such blobs; the related videos live in the second one
	first  = page.find(json_marker)
	second = page.find(json_marker, first + 1)
	end    = page.find(';</script>', second) if second != -1 else -1
	if second == -1 or end == -1:
		raise ValueError("WEBrelated could not locate the embedded JSON in the watch page")

	extracted_json = json.loads(page[second:end])

	return extracted_json["contents"]['twoColumnWatchNextResults']["secondaryResults"]

def WEBextractSinglePage(uri: str):
	# WARNING! HIGHLY EXPERIMENTAL, DUE TO BREAK ANYTIME
	# Fetches the watch page of a video and parses the two JSON blobs embedded
	# in it.
	#
	# uri - an 11-character video ID.
	#
	# Returns {'ec1': <first blob>, 'ec2': <second blob>, 'took': <seconds spent>}.
	# Raises ValueError if uri is not 11 characters long, or if the blobs
	# cannot be located/parsed.

	start_time = time.time()

	if len(uri) != 11:
		raise ValueError("WEBextractSinglePage expects a single, 11-character long argument")

	json_marker = '{"responseContext":{"serviceTrackingParams":'

	response = requests.get("https://www.youtube.com/watch?v=" + uri, headers=ythdd_globals.getHeaders(caller='extractor'))
	# The second argument of decode() is the error handler, not a codec;
	# malformed bytes are replaced rather than aborting the whole extraction.
	page = response.content.decode('utf-8', errors='replace')

	# first blob ends at the next ';var ', second one at the closing script tag
	first_start  = page.find(json_marker)
	first_end    = page.find(';var ', first_start) if first_start != -1 else -1
	second_start = page.find(json_marker, first_start + 1) if first_start != -1 else -1
	second_end   = page.find(';</script>', second_start) if second_start != -1 else -1

	# a -1 offset would otherwise be used as a slice index and produce garbage
	if -1 in (first_start, first_end, second_start, second_end):
		raise ValueError("WEBextractSinglePage could not locate the embedded JSON in the watch page")

	extracted_json1 = json.loads(page[first_start:first_end])
	extracted_json2 = json.loads(page[second_start:second_end])

	end_time = time.time()

	return {'ec1': extracted_json1, 'ec2': extracted_json2, 'took': end_time - start_time}

def paramsFromUrl(url: str) -> dict:
	# Returns a dictionary of params from a given URL.
	#
	# The URL is split on "&"; in the first segment everything up to and
	# including the first "?" is dropped. Each remaining segment is split at
	# its FIRST "=" only, so values that themselves contain "=" (e.g. base64
	# padding) are kept intact. Segments without "=" (empty ones, or a URL
	# with no query at all) are skipped. Keys and values are returned as-is,
	# without percent-decoding; a repeated key keeps its last value.
	params = {}

	for num, segment in enumerate(url.split("&")):
		if num == 0:
			# find() yields -1 when there is no "?", which keeps the whole segment
			segment = segment[segment.find("?") + 1:]
		key, separator, value = segment.partition("=")
		if not separator:
			continue
		params[key] = value

	return params

def IOSextract(uri: str):
	# Queries two innertube endpoints for a video and returns their parsed
	# JSON responses together with the time spent.
	#
	# uri - an 11-character video ID.
	#
	# Returns {'stage1': <response of /youtubei/v1/player>,
	#          'stage3': <response of /youtubei/v1/next>,
	#          'took':   <seconds spent>}.
	# Raises ValueError if uri is not 11 characters long.

	start = time.time()

	if len(uri) != 11:
		raise ValueError("IOSextract expects a single, 11-character long uri as an argument")

	# Build the request bodies per call instead of writing 'videoId' into the
	# shared module-level templates, so that overlapping calls cannot
	# overwrite each other's video ID.
	player_body = {**stage1_body, 'videoId': uri}
	stage1_h = requests.post("https://www.youtube.com/youtubei/v1/player?prettyPrint=false", headers=stage1_headers, json=player_body)
	stage1   = json.loads(stage1_h.content.decode('utf-8'))

	#stage2_h = requests.get(stage1['streamingData']['hlsManifestUrl'], headers=stage2_headers)
	#stage2   = stage2_h.content.decode('utf-8')

	next_body = {**stage3_body, 'videoId': uri}
	stage3_h = requests.post("https://www.youtube.com/youtubei/v1/next?prettyPrint=false", headers=stage3_headers, json=next_body)
	stage3   = json.loads(stage3_h.content.decode('utf-8'))

	end = time.time()

	#return {'stage1': stage1, 'stage2': stage2, 'stage3': stage3, 'took': end - start}
	return {'stage1': stage1, 'stage3': stage3, 'took': end - start}

def makeWebContext(secondaryContextDict: dict):
	# Uses web_context_dict to create a context, returns a dict.
	#
	# The result is a fresh copy of web_context_dict with every top-level key
	# of secondaryContextDict added (or overriding an existing key).
	import copy

	# Copy first: assigning into web_context_dict itself would make keys from
	# one call (e.g. 'browseId') leak into every later context, and all
	# returned contexts would be one and the same object.
	current_web_context_dict = copy.deepcopy(web_context_dict)
	current_web_context_dict.update(secondaryContextDict)

	return current_web_context_dict

def getChannelAvatar(response_json: dict):
	# Returns a dictionary: {url: <proxied url to remote server>, width: ..., height: ...}
	# containing the best resolution in terms of pixel count.
	# A great majority of the code has been influenced by https://github.com/iv-org/invidious/blob/master/src/invidious/channels/about.cr.
	#
	# The returned dictionary is a copy; response_json is left untouched.
	# Raises ValueError if no avatar can be found in response_json.

	avatars = safeTraverse(response_json, ['metadata', 'channelMetadataRenderer', 'avatar', 'thumbnails'], default=None)

	if not avatars:
		# fallback to lower resolution avatars
		avatars = safeTraverse(response_json, ['header',
			'pageHeaderRenderer',
			'content',
			'pageHeaderViewModel',
			'image',
			'decoratedAvatarViewModel',
			'avatar',
			'avatarViewModel',
			'image',
			'sources'], default=None)

	if not avatars:
		# TODO: if avatars is still None, use a local avatar
		raise ValueError("getChannelAvatar could not find any avatar in the given response")

	best_avatar = avatars[-1] # usually, the best avatar is stored last
	best_area   = best_avatar['width'] * best_avatar['height']
	for avatar in avatars:
		area = avatar['width'] * avatar['height']
		if area > best_area:
			best_avatar, best_area = avatar, area

	# Build a new dict instead of rewriting the url inside response_json:
	# otherwise a second call on the same response would translate the
	# already translated link again.
	return {**best_avatar, 'url': ythdd_globals.translateLinks(best_avatar['url'])}

def isVerified(response_json: dict):
	# Tells whether a user badge (verified/artist) is present: True when the
	# value obtained from safeTraverse() is truthy, False otherwise.
	# NOTE(review): the traversal path is an empty list, so the outcome depends
	# on how safeTraverse treats an empty path - looks unfinished, confirm.
	return bool(safeTraverse(response_json, [], default=False))

def browseAbout(ucid: str):
	# Posts a WEB context carrying the channel's browseId to innertube's
	# browse endpoint and hands back the response body parsed from JSON (a dict).
	# Raises ValueError when ucid is not exactly 24 characters long.

	if len(ucid) != 24:
		raise ValueError(f"Something is wrong with the UCID {ucid}. Expected a 24-character long channel ID, not {len(ucid)}.")

	endpoint = 'https://www.youtube.com/youtubei/v1/browse?prettyPrint=false'
	payload  = makeWebContext({'browseId': ucid})

	reply = requests.post(endpoint, headers=ythdd_globals.getHeaders(), json=payload)

	return json.loads(reply.text)