
Commit f8580bf

[Bilibili] Add 8k support (yt-dlp#1964)
Closes yt-dlp#1898, yt-dlp#1819
Authored by: u-spec-png
1 parent 19afd9e

File tree

1 file changed: +80 -76 lines changed

yt_dlp/extractor/bilibili.py

Lines changed: 80 additions & 76 deletions
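
Before the diff, a quick orientation: the commit teaches the extractor to read the DASH streams embedded in the page's window.__playinfo__ JSON (where the 8K renditions live), keeping the old signed playurl API only as a fallback. A minimal sketch of the new field mapping under that assumption — the sample dict below is invented, while float_or_none, int_or_none and mimetype2ext are the real yt_dlp.utils helpers the diff imports:

from yt_dlp.utils import float_or_none, int_or_none, mimetype2ext

# One hypothetical entry of window.__playinfo__['data']['dash']['video']:
sample = {
    'baseUrl': 'https://example.bilivideo.com/video.m4s',  # made-up URL
    'mimeType': 'video/mp4',
    'frameRate': '30',
    'width': 7680,
    'height': 4320,  # the 8K rendition this commit makes reachable
    'codecs': 'hev1.1.6.L186.90',
    'bandwidth': 24000000,
}

# The same mapping the patched extractor performs for each DASH entry:
fmt = {
    'url': sample.get('baseUrl') or sample.get('base_url') or sample.get('url'),
    'ext': mimetype2ext(sample.get('mimeType') or sample.get('mime_type')),   # -> 'mp4'
    'fps': int_or_none(sample.get('frameRate') or sample.get('frame_rate')),  # -> 30
    'width': int_or_none(sample.get('width')),
    'height': int_or_none(sample.get('height')),
    'vcodec': sample.get('codecs'),
    'tbr': float_or_none(sample.get('bandwidth'), scale=1000),                # -> 24000.0
}
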
@@ -17,9 +17,9 @@
     ExtractorError,
     int_or_none,
     float_or_none,
+    mimetype2ext,
     parse_iso8601,
     traverse_obj,
-    try_get,
     parse_count,
     smuggle_url,
     srt_subtitles_timecode,
@@ -53,15 +53,13 @@ class BiliBiliIE(InfoExtractor):
         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
             'id': '1074402',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': '【金坷垃】金泡沫',
+            'uploader_id': '156160',
+            'uploader': '菊子桑',
+            'upload_date': '20140420',
             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
-            'duration': 308.067,
             'timestamp': 1398012678,
-            'upload_date': '20140420',
-            'thumbnail': r're:^https?://.+\.jpg',
-            'uploader': '菊子桑',
-            'uploader_id': '156160',
         },
     }, {
         # Tested in BiliBiliBangumiIE
@@ -82,42 +80,20 @@ class BiliBiliIE(InfoExtractor):
         },
         'skip': 'Geo-restricted to China',
     }, {
-        # Title with double quotes
         'url': 'http://www.bilibili.com/video/av8903802/',
         'info_dict': {
             'id': '8903802',
+            'ext': 'mp4',
             'title': '阿滴英文|英文歌分享#6 "Closer',
+            'upload_date': '20170301',
             'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
+            'timestamp': 1488382634,
+            'uploader_id': '65880958',
+            'uploader': '阿滴英文',
+        },
+        'params': {
+            'skip_download': True,
         },
-        'playlist': [{
-            'info_dict': {
-                'id': '8903802_part1',
-                'ext': 'flv',
-                'title': '阿滴英文|英文歌分享#6 "Closer',
-                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
-                'uploader': '阿滴英文',
-                'uploader_id': '65880958',
-                'timestamp': 1488382634,
-                'upload_date': '20170301',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        }, {
-            'info_dict': {
-                'id': '8903802_part2',
-                'ext': 'flv',
-                'title': '阿滴英文|英文歌分享#6 "Closer',
-                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
-                'uploader': '阿滴英文',
-                'uploader_id': '65880958',
-                'timestamp': 1488382634,
-                'upload_date': '20170301',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        }]
     }, {
         # new BV video id format
         'url': 'https://www.bilibili.com/video/BV1JE411F741',
@@ -152,6 +128,7 @@ def _real_extract(self, url):
         av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None)
         video_id = av_id
 
+        info = {}
         anime_id = mobj.group('anime_id')
         page_id = mobj.group('page')
         webpage = self._download_webpage(url, video_id)
@@ -203,66 +180,95 @@ def _real_extract(self, url):
         }
         headers.update(self.geo_verification_headers())
 
+        video_info = self._parse_json(
+            self._search_regex(r'window.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None),
+            video_id, fatal=False) or {}
+        video_info = video_info.get('data') or {}
+
+        durl = traverse_obj(video_info, ('dash', 'video'))
+        audios = traverse_obj(video_info, ('dash', 'audio')) or []
         entries = []
 
         RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
         for num, rendition in enumerate(RENDITIONS, start=1):
             payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
             sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
-
-            video_info = self._download_json(
-                'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
-                video_id, note='Downloading video info page',
-                headers=headers, fatal=num == len(RENDITIONS))
-
             if not video_info:
-                continue
+                video_info = self._download_json(
+                    'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
+                    video_id, note='Downloading video info page',
+                    headers=headers, fatal=num == len(RENDITIONS))
+                if not video_info:
+                    continue
 
-            if 'durl' not in video_info:
+            if not durl and 'durl' not in video_info:
                 if num < len(RENDITIONS):
                     continue
                 self._report_error(video_info)
 
-            for idx, durl in enumerate(video_info['durl']):
-                formats = [{
-                    'url': durl['url'],
-                    'filesize': int_or_none(durl['size']),
-                }]
-                for backup_url in durl.get('backup_url', []):
+            formats = []
+            for idx, durl in enumerate(durl or video_info['durl']):
+                formats.append({
+                    'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'),
+                    'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')),
+                    'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')),
+                    'width': int_or_none(durl.get('width')),
+                    'height': int_or_none(durl.get('height')),
+                    'vcodec': durl.get('codecs'),
+                    'acodec': 'none' if audios else None,
+                    'tbr': float_or_none(durl.get('bandwidth'), scale=1000),
+                    'filesize': int_or_none(durl.get('size')),
+                })
+                for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []:
                     formats.append({
                         'url': backup_url,
-                        # backup URLs have lower priorities
                         'quality': -2 if 'hd.mp4' in backup_url else -3,
                     })
 
                 for a_format in formats:
                     a_format.setdefault('http_headers', {}).update({
                         'Referer': url,
                     })
-
-                self._sort_formats(formats)
-
-                entries.append({
-                    'id': '%s_part%s' % (video_id, idx),
-                    'duration': float_or_none(durl.get('length'), 1000),
-                    'formats': formats,
+            for audio in audios:
+                formats.append({
+                    'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'),
+                    'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')),
+                    'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')),
+                    'width': int_or_none(audio.get('width')),
+                    'height': int_or_none(audio.get('height')),
+                    'acodec': audio.get('codecs'),
+                    'vcodec': 'none',
+                    'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
+                    'filesize': int_or_none(audio.get('size'))
                 })
+                for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []:
+                    formats.append({
+                        'url': backup_url,
+                        # backup URLs have lower priorities
+                        'quality': -3,
+                    })
+
+            info.update({
+                'id': video_id,
+                'duration': float_or_none(durl.get('length'), 1000),
+                'formats': formats,
+            })
             break
 
+        self._sort_formats(formats)
+
         title = self._html_search_regex(
-            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+            (r'<h1[^>]+title=(["\'])(?P<title>[^"\']+)',
             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
             group='title', fatal=False)
 
         # Get part title for anthologies
         if page_id is not None:
-            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
-            part_title = try_get(
-                self._download_json(
-                    f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
-                    video_id, note='Extracting videos in anthology'),
-                lambda x: x['data'][int(page_id) - 1]['part'])
-            title = part_title or title
+            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
+            part_info = traverse_obj(self._download_json(
+                f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+                video_id, note='Extracting videos in anthology'), 'data', expected_type=list)
+            title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title
 
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
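
The legacy branch retained above still signs its playurl request by MD5-hashing the query string concatenated with an app secret. A standalone sketch of that signing scheme; the two key values here are placeholders, not the extractor's real _APP_KEY/_BILIBILI_KEY:

import hashlib

APP_KEY = '<app_key>'          # placeholder for BiliBiliIE._APP_KEY
BILIBILI_KEY = '<secret_key>'  # placeholder for BiliBiliIE._BILIBILI_KEY

def signed_playurl_url(cid, rendition='qn=80&quality=80&type='):
    # Sign the payload by appending the secret and hashing, as in the loop above.
    payload = 'appkey=%s&cid=%s&otype=json&%s' % (APP_KEY, cid, rendition)
    sign = hashlib.md5((payload + BILIBILI_KEY).encode('utf-8')).hexdigest()
    return 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign)
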
@@ -272,15 +278,15 @@ def _real_extract(self, url):
         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
 
         # TODO 'view_count' requires deobfuscating Javascript
-        info = {
+        info.update({
             'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id),
             'cid': cid,
             'title': title,
             'description': description,
             'timestamp': timestamp,
             'thumbnail': thumbnail,
             'duration': float_or_none(video_info.get('timelength'), scale=1000),
-        }
+        })
 
         uploader_mobj = re.search(
             r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
@@ -301,7 +307,7 @@ def _real_extract(self, url):
                 video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
         }
 
-        entries[0]['subtitles'] = {
+        info['subtitles'] = {
             'danmaku': [{
                 'ext': 'xml',
                 'url': f'https://comment.bilibili.com/{cid}.xml',
@@ -336,12 +342,10 @@ def _real_extract(self, url):
             entry['id'] = '%s_part%d' % (video_id, (idx + 1))
 
         return {
-            '_type': 'multi_video',
             'id': str(video_id),
             'bv_id': bv_id,
             'title': title,
             'description': description,
-            'entries': entries,
             **info, **top_level_info
         }
 
@@ -482,9 +486,9 @@ def _entries(self, list_id):
             data = self._download_json(
                 self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data']
 
-            max_count = max_count or try_get(data, lambda x: x['page']['count'])
+            max_count = max_count or traverse_obj(data, ('page', 'count'))
 
-            entries = try_get(data, lambda x: x['list']['vlist'])
+            entries = traverse_obj(data, ('list', 'vlist'))
             if not entries:
                 return
             for entry in entries:
@@ -522,7 +526,7 @@ def _fetch_page(self, api_url, num_pages, query, page_num):
             api_url, query, query={'Search_key': query, 'pn': page_num},
             note='Extracting results from page %s of %s' % (page_num, num_pages))
 
-        video_list = try_get(parsed_json, lambda x: x['data']['archives'], list)
+        video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
         if not video_list:
             raise ExtractorError('Failed to retrieve video list for page %d' % page_num)
 
@@ -552,7 +556,7 @@ def _entries(self, category, subcategory, query):
 
         api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
         page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'})
-        page_data = try_get(page_json, lambda x: x['data']['page'], dict)
+        page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict)
         count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size'))
         if count is None or not size:
             raise ExtractorError('Failed to calculate either page count or size')
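
The last few hunks are a mechanical migration from try_get with a lambda to traverse_obj with a key path; both return None rather than raising when a key is missing. A toy comparison (both helpers are real yt_dlp.utils functions; the dict is made up):

from yt_dlp.utils import traverse_obj, try_get

data = {'data': {'page': {'count': 3}}}

old_style = try_get(data, lambda x: x['data']['page']['count'])  # pre-commit idiom
new_style = traverse_obj(data, ('data', 'page', 'count'))        # post-commit idiom
assert old_style == new_style == 3
assert traverse_obj(data, ('data', 'missing', 'count')) is None  # no KeyError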
