17
17
ExtractorError ,
18
18
int_or_none ,
19
19
float_or_none ,
20
+ mimetype2ext ,
20
21
parse_iso8601 ,
21
22
traverse_obj ,
22
- try_get ,
23
23
parse_count ,
24
24
smuggle_url ,
25
25
srt_subtitles_timecode ,
@@ -53,15 +53,13 @@ class BiliBiliIE(InfoExtractor):
53
53
'md5' : '5f7d29e1a2872f3df0cf76b1f87d3788' ,
54
54
'info_dict' : {
55
55
'id' : '1074402' ,
56
- 'ext' : 'flv ' ,
56
+ 'ext' : 'mp4 ' ,
57
57
'title' : '【金坷垃】金泡沫' ,
58
+ 'uploader_id' : '156160' ,
59
+ 'uploader' : '菊子桑' ,
60
+ 'upload_date' : '20140420' ,
58
61
'description' : 'md5:ce18c2a2d2193f0df2917d270f2e5923' ,
59
- 'duration' : 308.067 ,
60
62
'timestamp' : 1398012678 ,
61
- 'upload_date' : '20140420' ,
62
- 'thumbnail' : r're:^https?://.+\.jpg' ,
63
- 'uploader' : '菊子桑' ,
64
- 'uploader_id' : '156160' ,
65
63
},
66
64
}, {
67
65
# Tested in BiliBiliBangumiIE
@@ -82,42 +80,20 @@ class BiliBiliIE(InfoExtractor):
82
80
},
83
81
'skip' : 'Geo-restricted to China' ,
84
82
}, {
85
- # Title with double quotes
86
83
'url' : 'http://www.bilibili.com/video/av8903802/' ,
87
84
'info_dict' : {
88
85
'id' : '8903802' ,
86
+ 'ext' : 'mp4' ,
89
87
'title' : '阿滴英文|英文歌分享#6 "Closer' ,
88
+ 'upload_date' : '20170301' ,
90
89
'description' : '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文' ,
90
+ 'timestamp' : 1488382634 ,
91
+ 'uploader_id' : '65880958' ,
92
+ 'uploader' : '阿滴英文' ,
93
+ },
94
+ 'params' : {
95
+ 'skip_download' : True ,
91
96
},
92
- 'playlist' : [{
93
- 'info_dict' : {
94
- 'id' : '8903802_part1' ,
95
- 'ext' : 'flv' ,
96
- 'title' : '阿滴英文|英文歌分享#6 "Closer' ,
97
- 'description' : 'md5:3b1b9e25b78da4ef87e9b548b88ee76a' ,
98
- 'uploader' : '阿滴英文' ,
99
- 'uploader_id' : '65880958' ,
100
- 'timestamp' : 1488382634 ,
101
- 'upload_date' : '20170301' ,
102
- },
103
- 'params' : {
104
- 'skip_download' : True ,
105
- },
106
- }, {
107
- 'info_dict' : {
108
- 'id' : '8903802_part2' ,
109
- 'ext' : 'flv' ,
110
- 'title' : '阿滴英文|英文歌分享#6 "Closer' ,
111
- 'description' : 'md5:3b1b9e25b78da4ef87e9b548b88ee76a' ,
112
- 'uploader' : '阿滴英文' ,
113
- 'uploader_id' : '65880958' ,
114
- 'timestamp' : 1488382634 ,
115
- 'upload_date' : '20170301' ,
116
- },
117
- 'params' : {
118
- 'skip_download' : True ,
119
- },
120
- }]
121
97
}, {
122
98
# new BV video id format
123
99
'url' : 'https://www.bilibili.com/video/BV1JE411F741' ,
@@ -152,6 +128,7 @@ def _real_extract(self, url):
152
128
av_id , bv_id = self ._get_video_id_set (video_id , mobj .group ('id_bv' ) is not None )
153
129
video_id = av_id
154
130
131
+ info = {}
155
132
anime_id = mobj .group ('anime_id' )
156
133
page_id = mobj .group ('page' )
157
134
webpage = self ._download_webpage (url , video_id )
@@ -203,66 +180,95 @@ def _real_extract(self, url):
203
180
}
204
181
headers .update (self .geo_verification_headers ())
205
182
183
+ video_info = self ._parse_json (
184
+ self ._search_regex (r'window.__playinfo__\s*=\s*({.+?})</script>' , webpage , 'video info' , default = None ),
185
+ video_id , fatal = False ) or {}
186
+ video_info = video_info .get ('data' ) or {}
187
+
188
+ durl = traverse_obj (video_info , ('dash' , 'video' ))
189
+ audios = traverse_obj (video_info , ('dash' , 'audio' )) or []
206
190
entries = []
207
191
208
192
RENDITIONS = ('qn=80&quality=80&type=' , 'quality=2&type=mp4' )
209
193
for num , rendition in enumerate (RENDITIONS , start = 1 ):
210
194
payload = 'appkey=%s&cid=%s&otype=json&%s' % (self ._APP_KEY , cid , rendition )
211
195
sign = hashlib .md5 ((payload + self ._BILIBILI_KEY ).encode ('utf-8' )).hexdigest ()
212
-
213
- video_info = self ._download_json (
214
- 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload , sign ),
215
- video_id , note = 'Downloading video info page' ,
216
- headers = headers , fatal = num == len (RENDITIONS ))
217
-
218
196
if not video_info :
219
- continue
197
+ video_info = self ._download_json (
198
+ 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload , sign ),
199
+ video_id , note = 'Downloading video info page' ,
200
+ headers = headers , fatal = num == len (RENDITIONS ))
201
+ if not video_info :
202
+ continue
220
203
221
- if 'durl' not in video_info :
204
+ if not durl and 'durl' not in video_info :
222
205
if num < len (RENDITIONS ):
223
206
continue
224
207
self ._report_error (video_info )
225
208
226
- for idx , durl in enumerate (video_info ['durl' ]):
227
- formats = [{
228
- 'url' : durl ['url' ],
229
- 'filesize' : int_or_none (durl ['size' ]),
230
- }]
231
- for backup_url in durl .get ('backup_url' , []):
209
+ formats = []
210
+ for idx , durl in enumerate (durl or video_info ['durl' ]):
211
+ formats .append ({
212
+ 'url' : durl .get ('baseUrl' ) or durl .get ('base_url' ) or durl .get ('url' ),
213
+ 'ext' : mimetype2ext (durl .get ('mimeType' ) or durl .get ('mime_type' )),
214
+ 'fps' : int_or_none (durl .get ('frameRate' ) or durl .get ('frame_rate' )),
215
+ 'width' : int_or_none (durl .get ('width' )),
216
+ 'height' : int_or_none (durl .get ('height' )),
217
+ 'vcodec' : durl .get ('codecs' ),
218
+ 'acodec' : 'none' if audios else None ,
219
+ 'tbr' : float_or_none (durl .get ('bandwidth' ), scale = 1000 ),
220
+ 'filesize' : int_or_none (durl .get ('size' )),
221
+ })
222
+ for backup_url in traverse_obj (durl , 'backup_url' , expected_type = list ) or []:
232
223
formats .append ({
233
224
'url' : backup_url ,
234
- # backup URLs have lower priorities
235
225
'quality' : - 2 if 'hd.mp4' in backup_url else - 3 ,
236
226
})
237
227
238
228
for a_format in formats :
239
229
a_format .setdefault ('http_headers' , {}).update ({
240
230
'Referer' : url ,
241
231
})
242
-
243
- self ._sort_formats (formats )
244
-
245
- entries .append ({
246
- 'id' : '%s_part%s' % (video_id , idx ),
247
- 'duration' : float_or_none (durl .get ('length' ), 1000 ),
248
- 'formats' : formats ,
232
+ for audio in audios :
233
+ formats .append ({
234
+ 'url' : audio .get ('baseUrl' ) or audio .get ('base_url' ) or audio .get ('url' ),
235
+ 'ext' : mimetype2ext (audio .get ('mimeType' ) or audio .get ('mime_type' )),
236
+ 'fps' : int_or_none (audio .get ('frameRate' ) or audio .get ('frame_rate' )),
237
+ 'width' : int_or_none (audio .get ('width' )),
238
+ 'height' : int_or_none (audio .get ('height' )),
239
+ 'acodec' : audio .get ('codecs' ),
240
+ 'vcodec' : 'none' ,
241
+ 'tbr' : float_or_none (audio .get ('bandwidth' ), scale = 1000 ),
242
+ 'filesize' : int_or_none (audio .get ('size' ))
249
243
})
244
+ for backup_url in traverse_obj (audio , 'backup_url' , expected_type = list ) or []:
245
+ formats .append ({
246
+ 'url' : backup_url ,
247
+ # backup URLs have lower priorities
248
+ 'quality' : - 3 ,
249
+ })
250
+
251
+ info .update ({
252
+ 'id' : video_id ,
253
+ 'duration' : float_or_none (durl .get ('length' ), 1000 ),
254
+ 'formats' : formats ,
255
+ })
250
256
break
251
257
258
+ self ._sort_formats (formats )
259
+
252
260
title = self ._html_search_regex (
253
- (r'<h1[^>]+\btitle =(["\'])(?P<title>(?:(?!\1).)+)\1 ' ,
261
+ (r'<h1[^>]+title =(["\'])(?P<title>[^"\']+) ' ,
254
262
r'(?s)<h1[^>]*>(?P<title>.+?)</h1>' ), webpage , 'title' ,
255
263
group = 'title' , fatal = False )
256
264
257
265
# Get part title for anthologies
258
266
if page_id is not None :
259
- # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
260
- part_title = try_get (
261
- self ._download_json (
262
- f'https://api.bilibili.com/x/player/pagelist?bvid={ bv_id } &jsonp=jsonp' ,
263
- video_id , note = 'Extracting videos in anthology' ),
264
- lambda x : x ['data' ][int (page_id ) - 1 ]['part' ])
265
- title = part_title or title
267
+ # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
268
+ part_info = traverse_obj (self ._download_json (
269
+ f'https://api.bilibili.com/x/player/pagelist?bvid={ bv_id } &jsonp=jsonp' ,
270
+ video_id , note = 'Extracting videos in anthology' ), 'data' , expected_type = list )
271
+ title = title if len (part_info ) == 1 else traverse_obj (part_info , (int (page_id ) - 1 , 'part' )) or title
266
272
267
273
description = self ._html_search_meta ('description' , webpage )
268
274
timestamp = unified_timestamp (self ._html_search_regex (
@@ -272,15 +278,15 @@ def _real_extract(self, url):
272
278
thumbnail = self ._html_search_meta (['og:image' , 'thumbnailUrl' ], webpage )
273
279
274
280
# TODO 'view_count' requires deobfuscating Javascript
275
- info = {
281
+ info . update ( {
276
282
'id' : str (video_id ) if page_id is None else '%s_part%s' % (video_id , page_id ),
277
283
'cid' : cid ,
278
284
'title' : title ,
279
285
'description' : description ,
280
286
'timestamp' : timestamp ,
281
287
'thumbnail' : thumbnail ,
282
288
'duration' : float_or_none (video_info .get ('timelength' ), scale = 1000 ),
283
- }
289
+ })
284
290
285
291
uploader_mobj = re .search (
286
292
r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<' ,
@@ -301,7 +307,7 @@ def _real_extract(self, url):
301
307
video_id , fatal = False , note = 'Downloading tags' ), ('data' , ..., 'tag_name' )),
302
308
}
303
309
304
- entries [ 0 ] ['subtitles' ] = {
310
+ info ['subtitles' ] = {
305
311
'danmaku' : [{
306
312
'ext' : 'xml' ,
307
313
'url' : f'https://comment.bilibili.com/{ cid } .xml' ,
@@ -336,12 +342,10 @@ def _real_extract(self, url):
336
342
entry ['id' ] = '%s_part%d' % (video_id , (idx + 1 ))
337
343
338
344
return {
339
- '_type' : 'multi_video' ,
340
345
'id' : str (video_id ),
341
346
'bv_id' : bv_id ,
342
347
'title' : title ,
343
348
'description' : description ,
344
- 'entries' : entries ,
345
349
** info , ** top_level_info
346
350
}
347
351
@@ -482,9 +486,9 @@ def _entries(self, list_id):
482
486
data = self ._download_json (
483
487
self ._API_URL % (list_id , page_num ), list_id , note = f'Downloading page { page_num } ' )['data' ]
484
488
485
- max_count = max_count or try_get (data , lambda x : x [ 'page' ][ 'count' ] )
489
+ max_count = max_count or traverse_obj (data , ( 'page' , 'count' ) )
486
490
487
- entries = try_get (data , lambda x : x [ 'list' ][ 'vlist' ] )
491
+ entries = traverse_obj (data , ( 'list' , 'vlist' ) )
488
492
if not entries :
489
493
return
490
494
for entry in entries :
@@ -522,7 +526,7 @@ def _fetch_page(self, api_url, num_pages, query, page_num):
522
526
api_url , query , query = {'Search_key' : query , 'pn' : page_num },
523
527
note = 'Extracting results from page %s of %s' % (page_num , num_pages ))
524
528
525
- video_list = try_get (parsed_json , lambda x : x [ 'data' ][ 'archives' ], list )
529
+ video_list = traverse_obj (parsed_json , ( 'data' , 'archives' ), expected_type = list )
526
530
if not video_list :
527
531
raise ExtractorError ('Failed to retrieve video list for page %d' % page_num )
528
532
@@ -552,7 +556,7 @@ def _entries(self, category, subcategory, query):
552
556
553
557
api_url = 'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value
554
558
page_json = self ._download_json (api_url , query , query = {'Search_key' : query , 'pn' : '1' })
555
- page_data = try_get (page_json , lambda x : x [ 'data' ][ 'page' ], dict )
559
+ page_data = traverse_obj (page_json , ( 'data' , 'page' ), expected_type = dict )
556
560
count , size = int_or_none (page_data .get ('count' )), int_or_none (page_data .get ('size' ))
557
561
if count is None or not size :
558
562
raise ExtractorError ('Failed to calculate either page count or size' )
0 commit comments