Skip to content

Commit b695e3f

Browse files
committed
[orf:tvthek] Lazy playlist extraction and obey --no-playlist
Closes yt-dlp#2411
1 parent 6a5a30f commit b695e3f

File tree

1 file changed

+126
-100
lines changed

1 file changed

+126
-100
lines changed

yt_dlp/extractor/orf.py

Lines changed: 126 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,68 @@
11
# coding: utf-8
22
from __future__ import unicode_literals
33

4+
import functools
45
import re
56

67
from .common import InfoExtractor
7-
from ..compat import compat_str
88
from ..utils import (
99
clean_html,
1010
determine_ext,
1111
float_or_none,
1212
HEADRequest,
13+
InAdvancePagedList,
1314
int_or_none,
1415
join_nonempty,
1516
orderedSet,
1617
remove_end,
18+
smuggle_url,
1719
str_or_none,
1820
strip_jsonp,
1921
unescapeHTML,
2022
unified_strdate,
23+
unsmuggle_url,
2124
url_or_none,
25+
urljoin,
2226
)
2327

2428

2529
class ORFTVthekIE(InfoExtractor):
2630
IE_NAME = 'orf:tvthek'
2731
IE_DESC = 'ORF TVthek'
28-
_VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
32+
_VALID_URL = r'(?P<url>https?://tvthek\.orf\.at/(?:(?:[^/]+/){2}){1,2}(?P<id>\d+))(/[^/]+/(?P<vid>\d+))?(?:$|[?#])'
2933

3034
_TESTS = [{
35+
'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079',
36+
'info_dict': {
37+
'id': '14121079',
38+
},
39+
'playlist_count': 11,
40+
'params': {'noplaylist': True}
41+
}, {
42+
'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150',
43+
'info_dict': {
44+
'id': '14121079',
45+
},
46+
'playlist_count': 1,
47+
'params': {'playlist_items': '5'}
48+
}, {
49+
'url': 'https://tvthek.orf.at/profile/ZIB-2/1211/ZIB-2/14121079/Umfrage-Welches-Tier-ist-Sebastian-Kurz/15083150',
50+
'info_dict': {
51+
'id': '14121079',
52+
'playlist_count': 1
53+
},
54+
'playlist': [{
55+
'info_dict': {
56+
'id': '15083150',
57+
'ext': 'mp4',
58+
'description': 'md5:7be1c485425f5f255a5e4e4815e77d04',
59+
'thumbnail': 'https://api-tvthek.orf.at/uploads/media/segments/0130/59/824271ea35cd8931a0fb08ab316a5b0a1562342c.jpeg',
60+
'title': 'Umfrage: Welches Tier ist Sebastian Kurz?',
61+
}
62+
}],
63+
'playlist_count': 1,
64+
'params': {'noplaylist': True, 'skip_download': 'm3u8'}
65+
}, {
3166
'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
3267
'playlist': [{
3368
'md5': '2942210346ed779588f428a92db88712',
@@ -62,8 +97,90 @@ class ORFTVthekIE(InfoExtractor):
6297
'only_matching': True,
6398
}]
6499

def _pagefunc(self, url, data_jsb, n, *, image=None):
    """Yield the info dict for the ``n``-th segment of the playlist data.

    Written as a generator so it can be used as a page function that
    produces exactly one entry per page.

    @param url       Base URL of the playlist page; each entry's
                     ``webpage_url`` is built as ``{url}/part/{video_id}``.
    @param data_jsb  Indexable collection of segment dicts. Each one must
                     provide ``id``, ``title`` and ``sources`` (a missing
                     key raises ``KeyError``).
    @param n         Index of the segment within ``data_jsb``.
    @param image     Fallback URL for the "full" thumbnail, used only when
                     the segment has no ``image_full_url`` of its own.
    """
    sd = data_jsb[n]
    video_id, title = str(sd['id']), sd['title']

    formats = []
    for fd in sd['sources']:
        src = url_or_none(fd.get('src'))
        if not src:
            continue
        format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd)
        ext = determine_ext(src)
        if ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src, video_id, 'mp4', m3u8_id=format_id, fatal=False,
                note=f'Downloading {format_id} m3u8 manifest')
            # A '/geoprotection' URL among the extracted formats is treated
            # as the marker for a geo-blocked stream
            if any('/geoprotection' in f['url'] for f in m3u8_formats):
                self.raise_geo_restricted()
            formats.extend(m3u8_formats)
        elif ext == 'f4m':
            formats.extend(self._extract_f4m_formats(
                src, video_id, f4m_id=format_id, fatal=False))
        elif ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src, video_id, mpd_id=format_id, fatal=False,
                note=f'Downloading {format_id} mpd manifest'))
        else:
            formats.append({
                'format_id': format_id,
                'url': src,
                'protocol': fd.get('protocol'),
            })

    # Check for geoblocking.
    # The data also carries an is_geoprotection property, but it is always
    # false, so probe the first plain HTTP(S) .mp4 format instead. The probe
    # only runs when a geoprotection_string is present and is non-fatal.
    geo_str = sd.get('geoprotection_string')
    http_url = None
    if geo_str:
        http_url = next(
            (f['url'] for f in formats if re.match(r'^https?://.*\.mp4$', f['url'])),
            None)
    if http_url:
        self._request_webpage(
            HEADRequest(http_url), video_id, fatal=False, note='Testing for geoblocking',
            errnote=f'This video seems to be blocked outside of {geo_str}. You may want to try the streaming-* formats')

    self._sort_formats(formats)

    subtitles = {}
    # Use `or` instead of a .get() default: the default only applies when the
    # key is absent, so an explicit null would otherwise break the iteration
    # (subtitles) or end up as a None language key (lang).
    for sub in sd.get('subtitles') or []:
        sub_src = sub.get('src')
        if not sub_src:
            continue
        subtitles.setdefault(sub.get('lang') or 'de-AT', []).append({
            'url': sub_src,
        })

    upload_date = unified_strdate(sd.get('created_date'))

    thumbnails = []
    preview = sd.get('preview_image_url')
    if preview:
        thumbnails.append({
            'id': 'preview',
            'url': preview,
            'preference': 0,
        })
    # The segment's own full-size image wins over the caller-supplied fallback
    image = sd.get('image_full_url') or image
    if image:
        thumbnails.append({
            'id': 'full',
            'url': image,
            'preference': 1,
        })

    yield {
        'id': video_id,
        'title': title,
        # Smuggle force_noplaylist along with the per-segment URL so that the
        # flag travels with it when the URL is extracted again
        'webpage_url': smuggle_url(f'{url}/part/{video_id}', {'force_noplaylist': True}),
        'formats': formats,
        'subtitles': subtitles,
        'description': sd.get('description'),
        'duration': int_or_none(sd.get('duration_in_seconds')),
        'upload_date': upload_date,
        'thumbnails': thumbnails,
    }
180+
65181
def _real_extract(self, url):
66-
playlist_id = self._match_id(url)
182+
url, smuggled_data = unsmuggle_url(url)
183+
playlist_id, video_id, base_url = self._match_valid_url(url).group('id', 'vid', 'url')
67184
webpage = self._download_webpage(url, playlist_id)
68185

69186
data_jsb = self._parse_json(
@@ -72,107 +189,16 @@ def _real_extract(self, url):
72189
webpage, 'playlist', group='json'),
73190
playlist_id, transform_source=unescapeHTML)['playlist']['videos']
74191

75-
entries = []
76-
for sd in data_jsb:
77-
video_id, title = sd.get('id'), sd.get('title')
78-
if not video_id or not title:
79-
continue
80-
video_id = compat_str(video_id)
81-
formats = []
82-
for fd in sd['sources']:
83-
src = url_or_none(fd.get('src'))
84-
if not src:
85-
continue
86-
format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd)
87-
ext = determine_ext(src)
88-
if ext == 'm3u8':
89-
m3u8_formats = self._extract_m3u8_formats(
90-
src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
91-
if any('/geoprotection' in f['url'] for f in m3u8_formats):
92-
self.raise_geo_restricted()
93-
formats.extend(m3u8_formats)
94-
elif ext == 'f4m':
95-
formats.extend(self._extract_f4m_formats(
96-
src, video_id, f4m_id=format_id, fatal=False))
97-
elif ext == 'mpd':
98-
formats.extend(self._extract_mpd_formats(
99-
src, video_id, mpd_id=format_id, fatal=False))
100-
else:
101-
formats.append({
102-
'format_id': format_id,
103-
'url': src,
104-
'protocol': fd.get('protocol'),
105-
})
192+
if not self._yes_playlist(playlist_id, video_id, smuggled_data):
193+
data_jsb = [sd for sd in data_jsb if str(sd.get('id')) == video_id]
106194

107-
# Check for geoblocking.
108-
# There is a property is_geoprotection, but that's always false
109-
geo_str = sd.get('geoprotection_string')
110-
if geo_str:
111-
try:
112-
http_url = next(
113-
f['url']
114-
for f in formats
115-
if re.match(r'^https?://.*\.mp4$', f['url']))
116-
except StopIteration:
117-
pass
118-
else:
119-
req = HEADRequest(http_url)
120-
self._request_webpage(
121-
req, video_id,
122-
note='Testing for geoblocking',
123-
errnote=((
124-
'This video seems to be blocked outside of %s. '
125-
'You may want to try the streaming-* formats.')
126-
% geo_str),
127-
fatal=False)
128-
129-
self._check_formats(formats, video_id)
130-
self._sort_formats(formats)
131-
132-
subtitles = {}
133-
for sub in sd.get('subtitles', []):
134-
sub_src = sub.get('src')
135-
if not sub_src:
136-
continue
137-
subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
138-
'url': sub_src,
139-
})
140-
141-
upload_date = unified_strdate(sd.get('created_date'))
142-
143-
thumbnails = []
144-
preview = sd.get('preview_image_url')
145-
if preview:
146-
thumbnails.append({
147-
'id': 'preview',
148-
'url': preview,
149-
'preference': 0,
150-
})
151-
image = sd.get('image_full_url')
152-
if not image and len(data_jsb) == 1:
153-
image = self._og_search_thumbnail(webpage)
154-
if image:
155-
thumbnails.append({
156-
'id': 'full',
157-
'url': image,
158-
'preference': 1,
159-
})
160-
161-
entries.append({
162-
'_type': 'video',
163-
'id': video_id,
164-
'title': title,
165-
'formats': formats,
166-
'subtitles': subtitles,
167-
'description': sd.get('description'),
168-
'duration': int_or_none(sd.get('duration_in_seconds')),
169-
'upload_date': upload_date,
170-
'thumbnails': thumbnails,
171-
})
195+
playlist_count = len(data_jsb)
196+
image = self._og_search_thumbnail(webpage) if playlist_count == 1 else None
172197

198+
page_func = functools.partial(self._pagefunc, base_url, data_jsb, image=image)
173199
return {
174200
'_type': 'playlist',
175-
'entries': entries,
201+
'entries': InAdvancePagedList(page_func, playlist_count, 1),
176202
'id': playlist_id,
177203
}
178204

0 commit comments

Comments
 (0)