|
1 | 1 | from requests_html import HTMLSession |
2 | 2 | from bs4 import BeautifulSoup as bs |
| 3 | +import re |
| 4 | +import json |
3 | 5 |
|
4 | 6 | # initialize a module-level requests_html session |
5 | 7 | session = HTMLSession() |
@@ -27,22 +29,30 @@ def get_video_info(url): |
27 | 29 | result["duration"] = soup.find("span", {"class": "ytp-time-duration"}).text |
28 | 30 | # get the video tags |
29 | 31 | result["tags"] = ', '.join([ meta.attrs.get("content") for meta in soup.find_all("meta", {"property": "og:video:tag"}) ]) |
30 | | - # number of likes |
31 | | - text_yt_formatted_strings = soup.find_all("yt-formatted-string", {"id": "text", "class": "ytd-toggle-button-renderer"}) |
32 | | - result["likes"] = ''.join([ c for c in text_yt_formatted_strings[0].attrs.get("aria-label") if c.isdigit() ]) |
33 | | - result["likes"] = 0 if result['likes'] == '' else int(result['likes']) |
34 | | - # number of dislikes |
35 | | - result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ]) |
36 | | - result['dislikes'] = 0 if result['dislikes'] == '' else int(result['dislikes']) |
37 | 32 |
|
| 33 | + # Additional video and channel information, parsed from the page's embedded ytInitialData JSON (approach from: https://stackoverflow.com/a/68262735) |
| 34 | + data = re.search(r"var ytInitialData = ({.*?});", soup.prettify()).group(1) |
| 35 | + data_json = json.loads(data) |
| 36 | + videoPrimaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][0]['videoPrimaryInfoRenderer'] |
| 37 | + videoSecondaryInfoRenderer = data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][1]['videoSecondaryInfoRenderer'] |
| 38 | + # number of likes |
| 39 | + likes_label = videoPrimaryInfoRenderer['videoActions']['menuRenderer']['topLevelButtons'][0]['toggleButtonRenderer']['defaultText']['accessibility']['accessibilityData']['label'] # "No likes" or "###,### likes" |
| 40 | + likes_str = likes_label.split(' ')[0].replace(',','') |
| 41 | + result["likes"] = '0' if likes_str == 'No' else likes_str |
| 42 | + # number of dislikes - not extracted here; YouTube appears to no longer publish it, so a placeholder value is stored |
| 43 | + # result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ]) |
| 44 | + # result["dislikes"] = '0' if result['dislikes'] == '' else result['dislikes'] |
| 45 | + result['dislikes'] = 'UNKNOWN' |
| 46 | + |
38 | 47 | # channel details |
39 | | - channel_tag = soup.find("yt-formatted-string", {"class": "ytd-channel-name"}).find("a") |
| 48 | + channel_tag = soup.find("meta", itemprop="channelId")['content'] |
40 | 49 | # channel name |
41 | | - channel_name = channel_tag.text |
| 50 | + channel_name = soup.find("span", itemprop="author").next.next['content'] |
42 | 51 | # channel URL |
43 | | - channel_url = f"https://www.youtube.com{channel_tag['href']}" |
| 52 | + # channel_url = soup.find("span", itemprop="author").next['href'] |
| 53 | + channel_url = f"https://www.youtube.com{channel_tag}" |
44 | 54 | # number of subscribers as str |
45 | | - channel_subscribers = soup.find("yt-formatted-string", {"id": "owner-sub-count"}).text.strip() |
| 55 | + channel_subscribers = videoSecondaryInfoRenderer['owner']['videoOwnerRenderer']['subscriberCountText']['accessibility']['accessibilityData']['label'] |
46 | 56 | result['channel'] = {'name': channel_name, 'url': channel_url, 'subscribers': channel_subscribers} |
47 | 57 | return result |
48 | 58 |
|
|
0 commit comments