Compare commits

...

6 Commits

Author SHA1 Message Date
InvalidUsernameException
d73473d35c
Merge 180d2d1a9b into 39d79c9b9c 2024-11-15 22:52:15 +01:00
InvalidUsernameException
180d2d1a9b Update outdated API URL 2024-09-19 21:50:18 +02:00
InvalidUsernameException
c6a3a9b246 Fix tests 2024-09-19 21:50:18 +02:00
InvalidUsernameException
44f8f59c88 Extract video entry info 2024-09-19 21:50:18 +02:00
InvalidUsernameException
6b6f97f3c9 Extract playlist metadata 2024-09-19 21:50:18 +02:00
InvalidUsernameException
8c27ce471d Rewrite ZDF channel extractor to use an API instead of web scraping 2024-09-19 21:50:18 +02:00

View File

@ -5,7 +5,6 @@ from ..utils import (
NO_DEFAULT,
ExtractorError,
determine_ext,
extract_attributes,
float_or_none,
int_or_none,
join_nonempty,
@ -25,6 +24,11 @@ class ZDFBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['DE']
_QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd', 'fhd', 'uhd')
def _download_mediathekv2_document(self, document_id):
    """Download and return the mediathekV2 JSON document for *document_id*."""
    document_url = f'https://zdf-prod-futura.zdf.de/mediathekV2/document/{document_id}'
    return self._download_json(document_url, document_id)
def _call_api(self, url, video_id, item, api_token=None, referrer=None):
headers = {}
if api_token:
@ -320,9 +324,7 @@ class ZDFIE(ZDFBaseIE):
return self._extract_entry(player['content'], player, content, video_id)
def _extract_mobile(self, video_id):
video = self._download_json(
f'https://zdf-cdn.live.cellular.de/mediathekV2/document/{video_id}',
video_id)
video = self._download_mediathekv2_document(video_id)
formats = []
formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list)
@ -387,18 +389,19 @@ class ZDFChannelIE(ZDFBaseIE):
'info_dict': {
'id': 'planet-e',
'title': 'planet e.',
'description': 'md5:87e3b9c66a63cf1407ee443d2c4eb88e',
},
'playlist_mincount': 50,
}, {
'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest',
'info_dict': {
'id': 'aktenzeichen-xy-ungeloest',
'title': 'Aktenzeichen XY... ungelöst',
'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)",
'title': 'Aktenzeichen XY... Ungelöst',
'description': 'md5:623ede5819c400c6d04943fa8100e6e7',
},
'playlist_mincount': 2,
}, {
'url': 'https://www.zdf.de/filme/taunuskrimi/',
'url': 'https://www.zdf.de/serien/taunuskrimi/',
'only_matching': True,
}]
@ -410,32 +413,72 @@ class ZDFChannelIE(ZDFBaseIE):
title = super()._og_search_title(webpage, fatal=fatal)
return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None
def _extract_document_id(self, webpage):
matches = re.search(r'docId\s*:\s*[\'"](?P<docid>[^\'"]+)[\'"]', webpage)
return matches and matches.group('docid')
def _get_playlist_description(self, page_data):
    """Build the playlist description from the ``shortText`` section of *page_data*.

    Returns the headline and the text separated by a blank line when both
    are present, whichever one is present otherwise, and ``None`` when
    neither is. Empty values are treated as missing.
    """
    headline = traverse_obj(page_data, ('shortText', 'headline'))
    text = traverse_obj(page_data, ('shortText', 'text'))
    # Test truthiness rather than ``is not None``: an empty headline or text
    # must not leave a dangling blank-line separator in the description.
    if headline and text:
        return f'{headline}\n\n{text}'
    return headline or text or None
def _convert_thumbnails(self, thumbnails):
return [{
'id': key,
'url': thumbnail_info['url'],
'width': int_or_none(thumbnail_info.get('width')),
'height': int_or_none(thumbnail_info.get('height')),
} for key, thumbnail_info in thumbnails.items() if url_or_none(thumbnail_info.get('url'))]
def _teaser_to_url_result(self, teaser):
    """Turn one teaser dict into a URL result handled by ``ZDFIE``.

    ``teaser['sharingUrl']`` is required (``KeyError`` if missing); every
    other field is optional and read with ``.get``. Numeric fields go
    through ``float_or_none`` / ``int_or_none``.
    """
    # ``.get('teaserBild', {})`` would still yield None when the key is present
    # with a null value, so fall back explicitly before converting.
    teaser_images = teaser.get('teaserBild') or {}
    return self.url_result(
        teaser['sharingUrl'], ie=ZDFIE.ie_key(),
        id=teaser.get('id'), title=teaser.get('titel', ''),
        thumbnails=self._convert_thumbnails(teaser_images),
        description=teaser.get('beschreibung'),
        duration=float_or_none(teaser.get('length')),
        media_type=teaser.get('currentVideoType') or teaser.get('contentType'),
        season_number=int_or_none(teaser.get('seasonNumber')),
        episode_number=int_or_none(teaser.get('episodeNumber')))
# Extract a channel page: either a single main video or a playlist of its videos.
# NOTE(review): this span appears to come from a rendered diff whose +/- markers and
# indentation were stripped, so statements of two implementations (one scanning the
# page HTML, one reading the mediathekV2 document) seem to be interleaved below.
# Confirm which lines belong to the current version before relying on any of them.
def _real_extract(self, url):
channel_id = self._match_id(url)
webpage = self._download_webpage(url, channel_id)
# HTML-based path: find <div> elements carrying data-plusbar-id / data-plusbar-url.
matches = re.finditer(
rf'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>{ZDFIE._VALID_URL})\1''',
webpage)
# Document-based path: collected while walking the clusters below.
main_video = None
playlist_videos = []
document_id = self._extract_document_id(webpage)
if document_id is not None:
data = self._download_mediathekv2_document(document_id)
# Walk every teaser of every cluster in the downloaded document.
for cluster in data['cluster']:
for teaser in cluster['teaser']:
# The first video teaser found in a 'teaserContent' cluster is kept as the main video.
if cluster['type'] == 'teaserContent' and teaser['type'] == 'video':
main_video = main_video or teaser
# Video teasers in 'teaser' clusters become playlist entries.
elif cluster['type'] == 'teaser' and teaser['type'] == 'video':
if teaser['brandId'] != document_id:
# These are unrelated 'You might also like' videos, filter them out
continue
playlist_videos.append(teaser)
# --no-playlist handling; both a matches-based and a main_video-based return follow.
if self._downloader.params.get('noplaylist', False):
entry = next(
(self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches),
None)
self.to_screen('Downloading just the main video because of --no-playlist')
if entry:
return entry
else:
self.to_screen(f'Downloading playlist {channel_id} - add --no-playlist to download just the main video')
# NOTE(review): evaluates to None when no main video was found - confirm callers accept that.
return self._teaser_to_url_result(main_video) if main_video else None
# Helper of the HTML-based path: looks up the <a> tag whose data-target-id equals the
# match's p_id and rejects it when data-target-video-type is 'novideo'.
def check_video(m):
v_ref = self._search_regex(
r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["']){}\2[^>]*>)'''.format(m.group('p_id')),
webpage, 'check id', default='')
v_ref = extract_attributes(v_ref)
return v_ref.get('data-target-video-type') != 'novideo'
self.to_screen(f'Downloading playlist {channel_id} - add --no-playlist to download just the main video')
return self.playlist_from_matches(
(m.group('url') for m in matches if check_video(m)),
channel_id, self._og_search_title(webpage, fatal=False))
# Playlist thumbnails: first non-empty of document.image, document.teaserBild, stageHeader.image.
# NOTE(review): `data` is only assigned inside the `document_id is not None` branch above;
# confirm this code cannot be reached when no document id was found.
thumbnails = (
traverse_obj(data, ('document', 'image'))
or traverse_obj(data, ('document', 'teaserBild'))
or traverse_obj(data, ('stageHeader', 'image'))
or {})
return self.playlist_result(
(self._teaser_to_url_result(video) for video in playlist_videos),
playlist_id=channel_id,
playlist_title=self._og_search_title(webpage, fatal=False),
description=self._get_playlist_description(data),
thumbnails=self._convert_thumbnails(thumbnails))