feat: video comments endpoint
@@ -434,4 +434,86 @@ def WEBgetSearchSuggestions(query: str, previous_query: str = '') -> list:
    return {
        "query": query,
        "suggestions": suggestions
    }

def WEBgetVideoComments(ctoken: str) -> tuple:

    # ctoken needs to be passed explicitly,
    # no guessing or retrieving it from globals.
    if ctoken is None:
        return [], ""

    # build a web context containing the relevant ctoken
    web_context = makeWebContext({"continuation": ctoken})
    response = requests.post('https://www.youtube.com/youtubei/v1/next',
                             params={"prettyPrint": False},
                             headers=stage2_headers,
                             data=json.dumps(web_context)
                             )

    results = []
    try:
        results = json.loads(response.text)
    except json.JSONDecodeError:
        pass

    comments = safeTraverse(results, ["frameworkUpdates", "entityBatchUpdate", "mutations"], default=[])
    comment_continuations = []
    comment_continuations_re = safeTraverse(results, ["onResponseReceivedEndpoints"], default=[])
    for received_endpoint in comment_continuations_re:

        # this is horrible...

        acia = safeTraverse(received_endpoint, ["appendContinuationItemsAction", "continuationItems"], default=[])
        rcic = safeTraverse(received_endpoint, ["reloadContinuationItemsCommand", "continuationItems"], default=[])

        for entry in acia:
            if "commentThreadRenderer" in entry or "continuationItemRenderer" in entry:
                comment_continuations = acia
                break

        for entry in rcic:
            if "commentThreadRenderer" in entry or "continuationItemRenderer" in entry:
                comment_continuations = rcic
                break

        if comment_continuations != []:
            break

    if comment_continuations == []:
        print("error: received an unknown comment structure, unable to parse continuations (replies)")
        # breakpoint()
        # return [], ""

    # extract the new continuation
    new_continuation = ""
    if "continuationItemRenderer" in safeTraverse(comment_continuations, [-1], default=[]):
        # first, look for the ctoken for the next page of comments inside of the response
        new_continuation = safeTraverse(comment_continuations, [-1, "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token"], default=None)
        # or search elsewhere in case this is a reply thread
        if new_continuation is None:
            new_continuation = safeTraverse(comment_continuations, [-1, "continuationItemRenderer", "button", "buttonRenderer", "command", "continuationCommand", "token"], default="")

    # perform a basic mutation check before parsing;
    # this will ignore replies liked by the video uploader ("hearts")
    actual_comments = [x for x in comments if "properties" in safeTraverse(x, ["payload", "commentEntityPayload"], default=[], quiet=True)]
    actual_comment_continuations = [x for x in comment_continuations if "replies" in safeTraverse(x, ["commentThreadRenderer"], default=[], quiet=True)]

    # link reply data (reply count and ctoken) for comments with replies
    for reply_renderer in actual_comment_continuations:

        mutual_key = safeTraverse(reply_renderer, ["commentThreadRenderer", "commentViewModel", "commentViewModel", "commentKey"], default="unknown-key")
        reply_ctoken = safeTraverse(reply_renderer, ["commentThreadRenderer", "replies", "commentRepliesRenderer", "contents", 0, "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token"], default="")
        reply_count = safeTraverse(reply_renderer, ["commentThreadRenderer", "replies", "commentRepliesRenderer", "viewReplies", "buttonRenderer", "text", "runs", 0, "text"], default="0 replies").split(" ")[0]

        for comment in actual_comments:
            found_key = safeTraverse(comment, ["entityKey"], default="unknown-key")
            # try to link the relevant ctoken if a comment has replies
            if found_key == mutual_key:
                if ythdd_globals.config["general"]["debug"]: print(f"found reply for {found_key}")
                comment["replies"] = {
                    "replyCount": int(reply_count),
                    "continuation": reply_ctoken
                }

    return actual_comments, new_continuation
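
# note (illustration only, not part of this diff): safeTraverse() is an existing ythdd helper
# whose definition isn't shown here; the sketch below only captures the semantics the code
# above assumes -- walk a mixed dict/list key path, return `default` on any missing step,
# and let `quiet=True` suppress logging of the miss.
def safeTraverse_sketch(obj, path, default=None, quiet=False):
    current = obj
    for key in path:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            if not quiet:
                print(f"safeTraverse: nothing at {key!r} while walking {path}")
            return default
    return current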
@@ -23,7 +23,7 @@ version = "0.0.1"
apiVersion = "1"
randomly_generated_passcode = 0
video_cache = {}
general_cache = {"search": [], "continuations": {"channels": {}}, "channels": {}}
general_cache = {"search": [], "continuations": {"channels": {}, "comments": {}}, "channels": {}}

def getConfig(configfile):
@@ -449,6 +449,12 @@ def videos(data):
    premium = True
    # TODO: detect paywalled patron-only videos

    # because we fetched the video's wdata, we might as
    # well save it inside of general cache so that
    # requests for the video's comments don't have to
    # spawn an additional request for initial ctoken
    ensure_comment_continuation(video_id, wdata)

    time_end = time()

    response = {
@@ -680,10 +686,70 @@ def get_channel_tab(requested_tab, ucid, req, only_json: bool = False):
    return send(200, response)

def get_comments(data, req, only_json: bool = False):

    # get the comment continuation
    ctoken = req.args.get('continuation')

    # perform some basic video id validation
    if len(data) < 4 or len(data[3]) != 11:
        return send(400, {"error": "Bad request: invalid videoId."})

    video_id = data[3]

    # if a ctoken isn't provided, get it from the general cache
    if ctoken is None or ctoken == '':
        # but first ensure it's there
        ensure_comment_continuation(video_id)
        ctoken = ythdd_globals.general_cache["continuations"]["comments"][video_id][0]

    # get joined video comment models
    wdata, new_continuation = ythdd_extractor.WEBgetVideoComments(ctoken)

    comments = []
    for comment in wdata:
        # parse the comment
        parsed_comment = ythdd_struct_parser.customCommentRendererParser(comment)
        if parsed_comment is not None:
            comments.append(parsed_comment)

    response = {
        "videoId": video_id,
        "comments": comments,
        "continuation": new_continuation
    }

    if only_json:
        return response

    return send(200, response)
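
# illustration only (not part of this diff): a client-side paging sketch, assuming the
# lookup() router exposes the new handler as /api/v1/comments/<videoId>; the host, port and
# videoId below are placeholders. The first call lets the server resolve the initial ctoken
# from its cache; subsequent calls feed the returned "continuation" back in.
import requests

BASE = "http://localhost:8080/api/v1/comments/dQw4w9WgXcQ"  # hypothetical instance + example videoId

page = requests.get(BASE).json()
print(f"got {len(page['comments'])} comments")
if page.get("continuation"):
    next_page = requests.get(BASE, params={"continuation": page["continuation"]}).json()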

def ensure_comment_continuation(video_id: str, wdata=None):

    # save the continuation token for comments in the global comment cache
    if video_id not in ythdd_globals.general_cache["continuations"]["comments"]:
        ythdd_globals.general_cache["continuations"]["comments"][video_id] = []

    if wdata is None:
        # perhaps saving related videos to the cache might not be a bad idea?
        wdata = ythdd_extractor.WEBextractSinglePage(video_id)

    # search for the "top comments" continuation token
    comment_continuation = safeTraverse(wdata, ["ec2", "engagementPanels", 0, "engagementPanelSectionListRenderer", "header", "engagementPanelTitleHeaderRenderer", "menu", "sortFilterSubMenuRenderer", "subMenuItems", 0, "serviceEndpoint", "continuationCommand", "token"], default=None)
    if comment_continuation is not None:
        ythdd_globals.general_cache["continuations"]["comments"][video_id].append(comment_continuation)
    else:
        print(f"error: couldn't extract comment continuation token from video page ({video_id})")

def channels(data, req, only_json: bool = False):

    # prevent a potential out of bounds read
    if len(data) < 4:
        return send(400, {"error": "No channel specified."})

    # silly sanity check
    if len(data[3]) != 24 or not data[3].startswith("UC"):
        return send(404, {"error": "This channel does not exist."})

    if len(data) > 4:
@@ -775,6 +841,8 @@ def lookup(data, req):
            return search(data, req)
        case 'channels':
            return channels(data, req)
        case 'comments':
            return get_comments(data, req)
        case _:
            incrementBadRequests()
            return notImplemented(data)
@@ -304,6 +304,46 @@ def parseRenderers(entry: dict, context: dict = {}) -> dict:
    # breakpoint()
    return

def customCommentRendererParser(comment: dict, context: dict = {}) -> dict:

    cep = safeTraverse(comment, ["payload", "commentEntityPayload"], default={})
    content = safeTraverse(cep, ["properties", "content", "content"], default="")
    content_html = escape(content).replace("\r\n", "<br>").replace("\n", "<br>")
    author = safeTraverse(cep, ["author"], default={})
    verified = safeTraverse(author, ["isVerified"], default=False) or safeTraverse(author, ["isArtist"], default=False)
    ucid = safeTraverse(author, ["channelId"], default="UNKNOWNCHANNELID")
    published_date = safeTraverse(cep, ["properties", "publishedTime"], default="now")
    edited = False

    if published_date.endswith(" (edited)"):
        edited = True
        published_date_unix = int(dateparser.parse(published_date.removesuffix(" (edited)")).timestamp())
    else:
        published_date_unix = int(dateparser.parse(published_date).timestamp())

    inv_comment = {
        "authorId": ucid,
        "authorUrl": "/channel/" + ucid,
        "author": safeTraverse(author, ["displayName"], default="@ythdd-unknown-user"),
        "verified": verified,
        "authorThumbnails": ythdd_extractor.generateChannelAvatarsFromUrl(safeTraverse(author, ["avatarThumbnailUrl"], default=DEFAULT_AVATAR)), # proxy them!
        "authorIsChannelOwner": safeTraverse(author, ["isCreator"], default=False), # ???
        "isSponsor": False, # not sure how to retrieve this
        "likeCount": parseViewsFromViewText("0" + safeTraverse(cep, ["toolbar", "likeCountNotliked"], default="0") + " likes"),
        "isPinned": False,
        "commentId": safeTraverse(cep, ["properties", "commentId"], default="UNKNOWNCOMMENTID"),
        "content": content,
        "contentHtml": content_html,
        "isEdited": edited,
        "published": published_date_unix,
        "publishedText": published_date if published_date != "now" else "unknown amount of time ago"
    }

    if "replies" in comment:
        inv_comment["replies"] = comment["replies"]

    return inv_comment
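
# illustration only (not part of this diff): how the publishedTime handling above behaves,
# assuming dateparser resolves relative phrases such as "2 weeks ago" to a datetime.
import dateparser

raw = "2 weeks ago (edited)"                    # example properties.publishedTime value
edited = raw.endswith(" (edited)")              # -> True, surfaced as "isEdited"
clean = raw.removesuffix(" (edited)")           # -> "2 weeks ago"
published_unix = int(dateparser.parse(clean).timestamp())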

def parseDescriptionSnippet(snippet: list):

    text = ""