def WEBgetVideoComments(ctoken: str) -> tuple:
    """Fetch a single page of comments (or replies) for a video.

    ctoken must be passed explicitly — no guessing or retrieving it
    from globals.

    Returns a tuple (comments, continuation): comments is the list of
    raw comment mutation dicts (each optionally annotated with a
    "replies" sub-dict), continuation is the token for the next page
    ("" when there is none).
    """
    if ctoken is None:
        return [], ""

    # build a web context containing the relevant ctoken and ask the
    # innertube "next" endpoint for the comment page
    web_context = makeWebContext({"continuation": ctoken})
    response = requests.post('https://www.youtube.com/youtubei/v1/next',
                             params={"prettyPrint": False},
                             headers=stage2_headers,
                             data=json.dumps(web_context)
                             )

    results = []
    try:
        results = json.loads(response.text)
    except ValueError:
        # non-JSON or truncated response: fall through with empty results
        pass

    comments = safeTraverse(results, ["frameworkUpdates", "entityBatchUpdate", "mutations"], default=[])

    # locate the continuation item list: it arrives either under an
    # appendContinuationItemsAction or a reloadContinuationItemsCommand
    comment_continuations = []
    received_endpoints = safeTraverse(results, ["onResponseReceivedEndpoints"], default=[])
    for received_endpoint in received_endpoints:

        acia = safeTraverse(received_endpoint, ["appendContinuationItemsAction", "continuationItems"], default=[])
        rcic = safeTraverse(received_endpoint, ["reloadContinuationItemsCommand", "continuationItems"], default=[])

        # when both shapes match, the reload command wins (same
        # precedence as the original sequential scans)
        for candidate in (acia, rcic):
            if any("commentThreadRenderer" in entry or "continuationItemRenderer" in entry for entry in candidate):
                comment_continuations = candidate

        if comment_continuations:
            break

    if not comment_continuations:
        print("error: received an unknown comment structure, unable to parse continuations (replies)")

    # extract the token for the next page: it is the trailing
    # continuationItemRenderer (top-level comments) or a button command
    # (reply threads)
    new_continuation = ""
    if "continuationItemRenderer" in safeTraverse(comment_continuations, [-1], default=[]):
        # first, look for ctoken inside of response for next page of comments
        new_continuation = safeTraverse(comment_continuations, [-1, "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token"], default=None)
        # or search elsewhere in case this is a reply thread
        if new_continuation is None:
            new_continuation = safeTraverse(comment_continuations, [-1, "continuationItemRenderer", "button", "buttonRenderer", "command", "continuationCommand", "token"], default="")

    # perform a basic mutation check before parsing
    # will ignore replies liked by video uploader ("hearts")
    actual_comments = [x for x in comments if "properties" in safeTraverse(x, ["payload", "commentEntityPayload"], default=[], quiet=True)]
    actual_comment_continuations = [x for x in comment_continuations if "replies" in safeTraverse(x, ["commentThreadRenderer"], default=[], quiet=True)]

    # index comments by entityKey once instead of rescanning the whole
    # list for every reply renderer
    comments_by_key = {}
    for comment in actual_comments:
        comments_by_key[safeTraverse(comment, ["entityKey"], default="unknown-key")] = comment

    # link reply data (reply count and ctoken) for comments with replies
    for reply_renderer in actual_comment_continuations:

        mutual_key = safeTraverse(reply_renderer, ["commentThreadRenderer", "commentViewModel", "commentViewModel", "commentKey"], default="unknown-key")
        reply_ctoken = safeTraverse(reply_renderer, ["commentThreadRenderer", "replies", "commentRepliesRenderer", "contents", 0, "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token"], default="")
        reply_count = safeTraverse(reply_renderer, ["commentThreadRenderer", "replies", "commentRepliesRenderer", "viewReplies", "buttonRenderer", "text", "runs", 0, "text"], default="0 replies").split(" ")[0]

        # never link on the sentinel key: it would attach reply data to
        # an arbitrary key-less comment
        if mutual_key == "unknown-key":
            continue

        comment = comments_by_key.get(mutual_key)
        if comment is not None:
            if ythdd_globals.config["general"]["debug"]: print(f"found reply for {mutual_key}")
            try:
                parsed_reply_count = int(reply_count.replace(",", ""))
            except ValueError:
                # abbreviated counts like "1.2K": better 0 than a crash
                parsed_reply_count = 0
            comment["replies"] = {
                "replyCount": parsed_reply_count,
                "continuation": reply_ctoken
            }

    return actual_comments, new_continuation
def get_comments(data, req, only_json: bool = False):
    """Invidious-compatible comments endpoint handler.

    data is the split request path (videoId expected at index 3); req
    carries the optional 'continuation' query argument selecting the
    page. When only_json is True the response dict is returned directly
    instead of being sent.
    """
    # get comment continuation requested by the client, if any
    ctoken = req.args.get('continuation')

    # perform some basic video id validation (YouTube ids are 11 chars)
    if len(data) < 4 or len(data[3]) != 11:
        return send(400, {"error": "Bad request: invalid videoId."})

    video_id = data[3]

    # if ctoken isn't provided, get it from the general cache
    if not ctoken:
        # but first ensure it's there
        ensure_comment_continuation(video_id)
        cached_tokens = ythdd_globals.general_cache["continuations"]["comments"][video_id]
        if not cached_tokens:
            # token extraction failed: report instead of crashing on [0]
            return send(500, {"error": "Couldn't retrieve a comment continuation for this video."})
        ctoken = cached_tokens[0]

    # get joined video comment models
    wdata, new_continuation = ythdd_extractor.WEBgetVideoComments(ctoken)

    comments = []
    for comment in wdata:
        # parse the comment
        parsed_comment = ythdd_struct_parser.customCommentRendererParser(comment)
        if parsed_comment is not None:
            comments.append(parsed_comment)

    response = {
        "videoId": video_id,
        "comments": comments,
        "continuation": new_continuation
    }

    if only_json:
        return response

    return send(200, response)


def ensure_comment_continuation(video_id: str, wdata = None):
    """Ensure the global comment cache holds an initial "top comments"
    continuation token for video_id.

    wdata may carry an already-fetched video page (saving a request);
    when omitted and nothing is cached yet, the page is fetched here.
    """
    # save continuation token for comments in global comment cache
    comment_cache = ythdd_globals.general_cache["continuations"]["comments"]
    if video_id not in comment_cache:
        comment_cache[video_id] = []

    if wdata is None:
        # a token is already cached: skip the extra network round trip
        if comment_cache[video_id]:
            return
        # perhaps saving related videos to cache might not be a bad idea?
        wdata = ythdd_extractor.WEBextractSinglePage(video_id)

    # search for "top comments" continuation token
    comment_continuation = safeTraverse(wdata, ["ec2", "engagementPanels", 0, "engagementPanelSectionListRenderer", "header", "engagementPanelTitleHeaderRenderer", "menu", "sortFilterSubMenuRenderer", "subMenuItems", 0, "serviceEndpoint", "continuationCommand", "token"], default=None)
    if comment_continuation is not None:
        # avoid growing the cache list with duplicates on repeat visits
        if comment_continuation not in comment_cache[video_id]:
            comment_cache[video_id].append(comment_continuation)
    else:
        print(f"error: couldn't extract comment continuation token from video page ({video_id})")
def customCommentRendererParser(comment: dict, context: dict = {}) -> dict:
    """Translate a raw commentEntityPayload mutation (as produced by the
    extractor's comment fetch) into an Invidious-style comment dict.

    context is currently unused; it is kept only for signature parity
    with the other renderer parsers and is never mutated, so the shared
    default dict is harmless here.
    """
    cep = safeTraverse(comment, ["payload", "commentEntityPayload"], default={})
    content = safeTraverse(cep, ["properties", "content", "content"], default="")
    # NOTE(review): the newline replacement targets were garbled in the
    # source; reconstructed as "<br>" — confirm against upstream
    content_html = escape(content).replace("\r\n", "<br>").replace("\n", "<br>")
    author = safeTraverse(cep, ["author"], default={})
    verified = safeTraverse(author, ["isVerified"], default=False) or safeTraverse(author, ["isArtist"], default=False)
    ucid = safeTraverse(author, ["channelId"], default="UNKNOWNCHANNELID")
    published_date = safeTraverse(cep, ["properties", "publishedTime"], default="now")

    # a " (edited)" suffix on the published time marks edited comments
    edited = published_date.endswith(" (edited)")
    parsed_date = dateparser.parse(published_date.removesuffix(" (edited)"))
    # dateparser.parse returns None on unparseable input; fall back to
    # epoch 0 instead of crashing on .timestamp()
    published_date_unix = int(parsed_date.timestamp()) if parsed_date is not None else 0

    inv_comment = {
        "authorId": ucid,
        "authorUrl": "/channel/" + ucid,
        "author": safeTraverse(author, ["displayName"], default="@ythdd-unknown-user"),
        "verified": verified,
        "authorThumbnails": ythdd_extractor.generateChannelAvatarsFromUrl(safeTraverse(author, ["avatarThumbnailUrl"], default=DEFAULT_AVATAR)),  # proxy them!
        "authorIsChannelOwner": safeTraverse(author, ["isCreator"], default=False),  # ???
        "isSponsor": False,  # not sure how to retrieve this
        "likeCount": parseViewsFromViewText("0" + safeTraverse(cep, ["toolbar", "likeCountNotliked"], default="0") + " likes"),
        "isPinned": False,
        "commentId": safeTraverse(cep, ["properties", "commentId"], default="UNKNOWNCOMMENTID"),
        "content": content,
        "contentHtml": content_html,
        "isEdited": edited,
        "published": published_date_unix,
        "publishedText": published_date if published_date != "now" else "unknown amount of time ago"
    }

    # reply metadata (count + ctoken) attached upstream by the extractor
    if "replies" in comment:
        inv_comment["replies"] = comment["replies"]

    return inv_comment