feat: video comments endpoint
@@ -435,3 +435,85 @@ def WEBgetSearchSuggestions(query: str, previous_query: str = '') -> list:
         "query": query,
         "suggestions": suggestions
     }
+
+def WEBgetVideoComments(ctoken: str) -> tuple:
+
+    # ctoken needs to be passed explicitly,
+    # no guessing or retrieving it from globals.
+    if ctoken is None:
+        return [], ""
+
+    # build web context containing the relevant ctoken
+    web_context = makeWebContext({"continuation": ctoken})
+    response = requests.post('https://www.youtube.com/youtubei/v1/next',
+        params={"prettyPrint": False},
+        headers=stage2_headers,
+        data=json.dumps(web_context)
+    )
+
+    results = []
+    try:
+        results = json.loads(response.text)
+    except:
+        pass
+
+    comments = safeTraverse(results, ["frameworkUpdates", "entityBatchUpdate", "mutations"], default=[])
+    comment_continuations = []
+    comment_continuations_re = safeTraverse(results, ["onResponseReceivedEndpoints"], default=[])
+    for received_endpoint in comment_continuations_re:
+
+        # this is horrible...
+
+        acia = safeTraverse(received_endpoint, ["appendContinuationItemsAction", "continuationItems"], default=[])
+        rcic = safeTraverse(received_endpoint, ["reloadContinuationItemsCommand", "continuationItems"], default=[])
+
+        for entry in acia:
+            if "commentThreadRenderer" in entry or "continuationItemRenderer" in entry:
+                comment_continuations = acia
+                break
+
+        for entry in rcic:
+            if "commentThreadRenderer" in entry or "continuationItemRenderer" in entry:
+                comment_continuations = rcic
+                break
+
+        if comment_continuations != []:
+            break
+
+    if comment_continuations == []:
+        print("error: received an unknown comment structure, unable to parse continuations (replies)")
+        # breakpoint()
+        # return [], ""
+
+    # extract new continuation
+    new_continuation = ""
+    if "continuationItemRenderer" in safeTraverse(comment_continuations, [-1], default=[]):
+        # first, look for a ctoken inside the response for the next page of comments
+        new_continuation = safeTraverse(comment_continuations, [-1, "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token"], default=None)
+        # or search elsewhere in case this is a reply thread
+        if new_continuation is None:
+            new_continuation = safeTraverse(comment_continuations, [-1, "continuationItemRenderer", "button", "buttonRenderer", "command", "continuationCommand", "token"], default="")
+
+    # perform a basic mutation check before parsing
+    # will ignore replies liked by video uploader ("hearts")
+    actual_comments = [x for x in comments if "properties" in safeTraverse(x, ["payload", "commentEntityPayload"], default=[], quiet=True)]
+    actual_comment_continuations = [x for x in comment_continuations if "replies" in safeTraverse(x, ["commentThreadRenderer"], default=[], quiet=True)]
+
+    # link reply data (reply count and ctoken) for comments with replies
+    for reply_renderer in actual_comment_continuations:
+
+        mutual_key = safeTraverse(reply_renderer, ["commentThreadRenderer", "commentViewModel", "commentViewModel", "commentKey"], default="unknown-key")
+        reply_ctoken = safeTraverse(reply_renderer, ["commentThreadRenderer", "replies", "commentRepliesRenderer", "contents", 0, "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token"], default="")
+        reply_count = safeTraverse(reply_renderer, ["commentThreadRenderer", "replies", "commentRepliesRenderer", "viewReplies", "buttonRenderer", "text", "runs", 0, "text"], default="0 replies").split(" ")[0]
+
+        for comment in actual_comments:
+            found_key = safeTraverse(comment, ["entityKey"], default="unknown-key")
+            # try to link a relevant ctoken if a comment has replies
+            if found_key == mutual_key:
+                if ythdd_globals.config["general"]["debug"]: print(f"found reply for {found_key}")
+                comment["replies"] = {
+                    "replyCount": int(reply_count),
+                    "continuation": reply_ctoken
+                }
+
+    return actual_comments, new_continuation
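The extractor above leans entirely on safeTraverse to walk InnerTube's deeply nested JSON without raising on missing keys. The helper already exists elsewhere in this repo; as a point of reference only, a minimal sketch of the semantics the new code assumes (return the default on any missing key, bad index, or type mismatch; quiet suppresses logging) could look like this:

# Minimal sketch of the assumed safeTraverse behaviour; the real helper in the
# repo may differ in details such as what it logs when quiet=False.
def safeTraverse(obj, path: list, default=None, quiet: bool = True):
    current = obj
    for step in path:
        try:
            # dict keys and list indices are both handled by subscription
            current = current[step]
        except (KeyError, IndexError, TypeError):
            if not quiet:
                print(f"safeTraverse: could not resolve step {step!r}")
            return default
    return current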
@@ -23,7 +23,7 @@ version = "0.0.1"
 apiVersion = "1"
 randomly_generated_passcode = 0
 video_cache = {}
-general_cache = {"search": [], "continuations": {"channels": {}}, "channels": {}}
+general_cache = {"search": [], "continuations": {"channels": {}, "comments": {}}, "channels": {}}

 def getConfig(configfile):

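For orientation, the new "comments" bucket is keyed by video id and holds the initial "top comments" ctoken appended by ensure_comment_continuation(); a hypothetical snapshot after one video has been visited (the video id is real, the token is a fabricated stand-in):

# Hypothetical state of ythdd_globals.general_cache after visiting one video.
general_cache = {
    "search": [],
    "continuations": {
        "channels": {},
        "comments": {
            "dQw4w9WgXcQ": ["Eg0SC2RRdzR3OVdnWGNSGAYy..."]  # fabricated, truncated ctoken
        }
    },
    "channels": {}
}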
@@ -449,6 +449,12 @@ def videos(data):
         premium = True
     # TODO: detect paywalled patron-only videos

+    # because we fetched the video's wdata, we might as
+    # well save it inside the general cache so that
+    # requests for the video's comments don't have to
+    # spawn an additional request for the initial ctoken
+    ensure_comment_continuation(video_id, wdata)
+
     time_end = time()

     response = {
@@ -680,10 +686,70 @@ def get_channel_tab(requested_tab, ucid, req, only_json: bool = False):

     return send(200, response)

+def get_comments(data, req, only_json: bool = False):
+
+    # get comment continuation
+    ctoken = req.args.get('continuation')
+
+    # perform some basic video id validation
+    if len(data) < 4 or len(data) >= 4 and len(data[3]) != 11:
+        return send(400, {"error": "Bad request: invalid videoId."})
+
+    video_id = data[3]
+
+    # if ctoken isn't provided, get it from the general cache
+    if ctoken is None or ctoken == '':
+        # but first ensure it's there
+        ensure_comment_continuation(video_id)
+        ctoken = ythdd_globals.general_cache["continuations"]["comments"][video_id][0]
+
+    # get joined video comment models
+    wdata, new_continuation = ythdd_extractor.WEBgetVideoComments(ctoken)
+
+    comments = []
+    for comment in wdata:
+        # parse the comment
+        parsed_comment = ythdd_struct_parser.customCommentRendererParser(comment)
+        if parsed_comment is not None:
+            comments.append(parsed_comment)
+
+    response = {
+        "videoId": video_id,
+        "comments": comments,
+        "continuation": new_continuation
+    }
+
+    if only_json:
+        return response
+
+    return send(200, response)
+
+
+def ensure_comment_continuation(video_id: str, wdata = None):
+
+    # save the continuation token for comments in the global comment cache
+    if not video_id in ythdd_globals.general_cache["continuations"]["comments"]:
+        ythdd_globals.general_cache["continuations"]["comments"][video_id] = []
+
+    if wdata is None:
+        # perhaps saving related videos to the cache might not be a bad idea?
+        wdata = ythdd_extractor.WEBextractSinglePage(video_id)
+
+    # search for the "top comments" continuation token
+    comment_continuation = safeTraverse(wdata, ["ec2", "engagementPanels", 0, "engagementPanelSectionListRenderer", "header", "engagementPanelTitleHeaderRenderer", "menu", "sortFilterSubMenuRenderer", "subMenuItems", 0, "serviceEndpoint", "continuationCommand", "token"], default=None)
+    if comment_continuation is not None:
+        ythdd_globals.general_cache["continuations"]["comments"][video_id].append(comment_continuation)
+    else:
+        print(f"error: couldn't extract comment continuation token from video page ({video_id})")
+
 def channels(data, req, only_json: bool = False):

+    # prevent potential out of bound read
+    if len(data) < 4:
+        return send(400, {"error": "No channel specified."})
+
     # silly sanity check
     if len(data[3]) != 24 or not data[3].startswith("UC"):
         return send(404, {"error": "This channel does not exist."})

     if len(data) > 4:
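Together with the lookup() dispatch below, this exposes an Invidious-style comments route. Assuming the instance serves the usual /api/v1/ prefix (the exact prefix is not shown in this diff) and runs locally, fetching the first page and following the continuation might look like:

import requests

BASE = "http://localhost:8080"   # hypothetical local ythdd instance
video_id = "dQw4w9WgXcQ"

# First page: omit the continuation and let the server resolve the initial
# ctoken itself (via ensure_comment_continuation and the general cache).
page = requests.get(f"{BASE}/api/v1/comments/{video_id}").json()
for comment in page["comments"]:
    print(comment["author"], comment["likeCount"], comment["content"][:60])

# Subsequent pages: feed the returned continuation back as a query parameter.
if page["continuation"]:
    next_page = requests.get(f"{BASE}/api/v1/comments/{video_id}",
                             params={"continuation": page["continuation"]}).json()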
@@ -775,6 +841,8 @@ def lookup(data, req):
             return search(data, req)
         case 'channels':
             return channels(data, req)
+        case 'comments':
+            return get_comments(data, req)
         case _:
             incrementBadRequests()
             return notImplemented(data)
@@ -304,6 +304,46 @@ def parseRenderers(entry: dict, context: dict = {}) -> dict:
     # breakpoint()
     return
+
+def customCommentRendererParser(comment: dict, context: dict = {}) -> dict:
+
+    cep = safeTraverse(comment, ["payload", "commentEntityPayload"], default={})
+    content = safeTraverse(cep, ["properties", "content", "content"], default="")
+    content_html = escape(content).replace("\r\n", "<br>").replace("\n", "<br>")
+    author = safeTraverse(cep, ["author"], default={})
+    verified = safeTraverse(author, ["isVerified"], default=False) or safeTraverse(author, ["isArtist"], default=False)
+    ucid = safeTraverse(author, ["channelId"], default="UNKNOWNCHANNELID")
+    published_date = safeTraverse(cep, ["properties", "publishedTime"], default="now")
+    edited = False
+
+    if published_date.endswith(" (edited)"):
+        edited = True
+        published_date_unix = int(dateparser.parse(published_date.removesuffix(" (edited)")).timestamp())
+    else:
+        published_date_unix = int(dateparser.parse(published_date).timestamp())
+
+    inv_comment = {
+        "authorId": ucid,
+        "authorUrl": "/channel/" + ucid,
+        "author": safeTraverse(author, ["displayName"], default="@ythdd-unknown-user"),
+        "verified": verified,
+        "authorThumbnails": ythdd_extractor.generateChannelAvatarsFromUrl(safeTraverse(author, ["avatarThumbnailUrl"], default=DEFAULT_AVATAR)), # proxy them!
+        "authorIsChannelOwner": safeTraverse(author, ["isCreator"], default=False), # ???
+        "isSponsor": False, # not sure how to retrieve this
+        "likeCount": parseViewsFromViewText("0" + safeTraverse(cep, ["toolbar", "likeCountNotliked"], default="0") + " likes"),
+        "isPinned": False,
+        "commentId": safeTraverse(cep, ["properties", "commentId"], default="UNKNOWNCOMMENTID"),
+        "content": content,
+        "contentHtml": content_html,
+        "isEdited": edited,
+        "published": published_date_unix,
+        "publishedText": published_date if published_date != "now" else "unknown amount of time ago"
+    }
+
+    if "replies" in comment:
+        inv_comment["replies"] = comment["replies"]
+
+    return inv_comment

 def parseDescriptionSnippet(snippet: list):

     text = ""
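To make the field mapping above concrete, here is a fabricated, heavily trimmed commentEntityPayload mutation and the call that would turn it into the Invidious-style dict (real InnerTube payloads carry far more fields than shown):

# Fabricated minimal mutation; only the fields customCommentRendererParser reads.
sample_comment = {
    "entityKey": "fabricated-comment-key",
    "payload": {
        "commentEntityPayload": {
            "properties": {
                "commentId": "UgxFabricatedCommentId",
                "content": {"content": "First!\nGreat video."},
                "publishedTime": "2 hours ago (edited)"
            },
            "author": {
                "channelId": "UCxxxxxxxxxxxxxxxxxxxxxx",
                "displayName": "@example-user",
                "avatarThumbnailUrl": "https://yt3.ggpht.com/example=s88",
                "isVerified": False,
                "isArtist": False,
                "isCreator": False
            },
            "toolbar": {"likeCountNotliked": "12"}
        }
    }
}

parsed = customCommentRendererParser(sample_comment)
# parsed["isEdited"] is True, parsed["published"] is a unix timestamp roughly
# two hours in the past, and parsed["contentHtml"] renders the newline as <br>.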