Compare commits

...

2 Commits

Author SHA1 Message Date
Kyle Gonsalves 1d851a6751 function for lambda 2024-04-23 16:15:22 -07:00
Kyle Gonsalves d5b48c06e6 address comments 2024-04-23 15:47:43 -07:00
1 changed file with 75 additions and 85 deletions

View File

@@ -17,7 +17,6 @@
int_or_none,
join_nonempty,
js_to_json,
merge_dicts,
parse_duration,
parse_iso8601,
parse_qs,
@@ -635,7 +634,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4',
'title': 'Germanwings crash site aerial video',
'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
'duration': None, # 47,
'duration': 47,
'timestamp': 1427219242,
'upload_date': '20150324',
'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
@@ -675,7 +674,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'skip_download': True,
},
# TODO: now in .pageData.promo.media of SIMORGH_DATA
'skip': 'video extraction failed',
}, {
# single video from video playlist embedded with vxp-playlist-data JSON
'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
@@ -696,11 +694,9 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
'info_dict': {
'id': 'p02q6gc4',
'ext': 'mp4',
# page title: 'Sri Lankas spicy secret',
'title': 'Tasting the spice of life in Jaffna',
# page description: 'As a new train line to Jaffna opens up the countrys north, travellers can experience a truly distinct slice of Tamil culture.',
'description': r're:(?s)BBC Travel Shows Henry Golding explores the city of Jaffna .{149} aftertaste\.$',
'timestamp': 1437935638, # was: 1437674293,
'timestamp': 1437935638,
'upload_date': '20150726',
'duration': 255,
},
@@ -731,17 +727,17 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'skip': 'Video no longer in page',
}, {
# single video in __INITIAL_DATA__ (was: playlist.sxml URL in playlist param)
# single video in __INITIAL_DATA__
'url': 'http://www.bbc.com/sport/0/football/33653409',
'info_dict': {
'id': 'p02xycnp',
'ext': 'mp4',
'title': 'Ronaldo to Man Utd, Arsenal to spend?',
'description': r'''re:(?s)BBC Sport's David Ornstein rounds up the latest transfer reports, .{359} here\.$''',
'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
'timestamp': 1437750175,
'upload_date': '20150724',
'thumbnail': 'https://news.bbcimg.co.uk/media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
'duration': None, # 140,
'duration': 140,
},
}, {
# article with multiple videos embedded with Morph.setPayload
@@ -753,7 +749,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
},
'playlist_count': 3,
}, {
# lead item from above playlist
# Testing noplaylist
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': 'p034ppnv',
@@ -797,7 +793,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
# video with window.__INITIAL_DATA__ and value as JSON string
'url': 'https://www.bbc.com/news/av/world-europe-59468682',
'info_dict': {
'id': 'p0b779gc', # was 'p0b71qth',
'id': 'p0b779gc',
'ext': 'mp4',
'title': 'Why France is making this woman a national hero',
'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{291} Casseville$',
@@ -1038,8 +1034,8 @@ def _real_extract(self, url):
webpage, 'group id', default=None)
if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())
f'https://www.bbc.co.uk/programmes/{group_id}',
ie=BBCCoUkIE)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
@@ -1106,17 +1102,15 @@ def _real_extract(self, url):
contains_pattern=r'\{(?:(?!</script>)[\s\S])+?(?:"leadMedia"|\\"videoData\\")\s*:(?:(?!</script>)[\s\S])+\}',
default={})
if morph_payload:
for component in traverse_obj(morph_payload, (
'body', 'components', lambda _, v: v['props']['leadMedia']['identifiers'])):
lead_media = component['props']['leadMedia']
programme_id = traverse_obj(lead_media['identifiers'], 'vpid', 'playablePid', expected_type=str)
for lead_media in traverse_obj(morph_payload, (
'body', 'components', ..., 'props', 'leadMedia', {dict})):
programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
if not programme_id:
continue
title = lead_media.get('title') or self._og_search_title(webpage)
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'title': lead_media.get('title') or self._og_search_title(webpage),
**traverse_obj(lead_media, {
'description': ('summary', {str}),
'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
@@ -1126,11 +1120,9 @@ def _real_extract(self, url):
'formats': formats,
'subtitles': subtitles,
}
body = traverse_obj(morph_payload, (
'body', 'content', 'article', 'body',
{lambda s: self._parse_json(s, playlist_id, fatal=False)}))
added = False
for video_data in traverse_obj(body, (Ellipsis, 'videoData', {lambda v: v.get('pid') and v})):
body = self._parse_json(traverse_obj(morph_payload, (
'body', 'content', 'article', 'body')), playlist_id, fatal=False)
for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
if video_data.get('vpid'):
video_id = video_data['vpid']
formats, subtitles = self._download_media_selector(video_id)
@@ -1142,22 +1134,24 @@ def _real_extract(self, url):
else:
video_id = video_data['pid']
entry = self.url_result(
'https://www.bbc.co.uk/programmes/%s' % video_id, BBCCoUkIE.ie_key(),
f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
video_id, url_transparent=True)
entry = merge_dicts(
traverse_obj(morph_payload, (
entry = {
**traverse_obj(morph_payload, (
'body', 'content', 'article', {
'timestamp': ('dateTimeInfo', 'dateTime', {parse_iso8601}),
})), traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}), entry)
}
)),
**traverse_obj(video_data, {
'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
'title': (('title', 'caption'), {str}, any),
'duration': ('duration', {parse_duration}),
}),
}
if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
return entry
entries.append(entry)
added = True
if added:
if entries:
playlist_title = traverse_obj(morph_payload, (
'body', 'content', 'article', 'headline', {str})) or playlist_title
return self.playlist_result(
@@ -1168,36 +1162,34 @@ def _real_extract(self, url):
r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
'preload state', playlist_id, transform_source=js_to_json, default={})
# PRELOADED_STATE with current programmme
current_programme = traverse_obj(preload_state, (
'programmes', 'current', {dict}))
if current_programme:
programme_id = traverse_obj(current_programme, ('id', {str}))
if programme_id and current_programme.get('type') == 'playable_item':
title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'formats': formats,
**traverse_obj(current_programme, {
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
}),
'subtitles': subtitles,
**traverse_obj(preload_state, {
'chapters': (
'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
}
)
}),
}
current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
programme_id = traverse_obj(current_programme, ('id', {str}))
if programme_id and current_programme.get('type') == 'playable_item':
title = traverse_obj(current_programme, ('titles', 'tertiary', {str})) or playlist_title
formats, subtitles = self._download_media_selector(programme_id)
return {
'id': programme_id,
'title': title,
'formats': formats,
**traverse_obj(current_programme, {
'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
'duration': ('duration', 'value', {int_or_none}),
'uploader': ('network', 'short_title', {str}),
'uploader_id': ('network', 'id', {str}),
}),
'subtitles': subtitles,
**traverse_obj(preload_state, {
'chapters': (
'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
'title': ('titles', {lambda x: join_nonempty(
'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
'start_time': ('offset', 'start', {float_or_none}),
'end_time': ('offset', 'end', {float_or_none}),
}
)
}),
}
# PWA_PRELOADED_STATE with article video asset
asset_id = traverse_obj(preload_state, (
@@ -1231,7 +1223,7 @@ def image_url(image_id):
}
else:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % asset_id, BBCCoUkIE.ie_key(),
f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
asset_id, playlist_title, display_id=playlist_id,
description=playlist_description)
@@ -1282,7 +1274,7 @@ def image_url(image_id):
k_int_or_none = functools.partial(int_or_none, scale=1000)
def parse_model(model):
'''Extract single video from model structure'''
"""Extract single video from model structure"""
item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
if not item_id:
return
@@ -1301,6 +1293,9 @@ def parse_model(model):
})
}
def is_type(*types):
return lambda _, v: v['type'] in types
initial_data = self._search_regex(
r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
'quoted preload state', default=None)
@@ -1312,18 +1307,16 @@ def parse_model(model):
initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
if initial_data:
added = False
for video_data in traverse_obj(initial_data, (
'stores', 'article', 'articleBodyContent', lambda _, v: v['type'] == 'video')):
'stores', 'article', 'articleBodyContent', is_type('video'))):
model = traverse_obj(video_data, (
'model', 'blocks', lambda _, v: v['type'] == 'aresMedia',
'model', 'blocks', lambda _, v: v['type'] == 'aresMediaMetadata',
'model', 'blocks', is_type('aresMedia'),
'model', 'blocks', is_type('aresMediaMetadata'),
'model', {dict}, any))
entry = parse_model(model)
if entry:
entries.append(entry)
added = True
if added:
if entries:
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
@@ -1361,15 +1354,13 @@ def parse_media(media):
'description': strip_or_none(item_desc),
})
for resp in traverse_obj(initial_data, ('data', lambda _, v: v.get('name'))):
name = resp['name']
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name')
if name == 'media-experience':
parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
elif name == 'article':
for block in traverse_obj(resp, ('data', (
None, ('content', 'model')), 'blocks',
for block in traverse_obj(resp, (
'data', (None, ('content', 'model')), 'blocks',
lambda _, v: v.get('type') in {'media', 'video'},
'model', {dict})):
parse_media(block)
@@ -1383,19 +1374,18 @@ def extract_all(pattern):
# US accessed article with single embedded video (e.g.
# https://www.bbc.com/news/uk-68546268)
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), (
'props', 'pageProps', 'page'))
next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
('props', 'pageProps', 'page'))
model = traverse_obj(next_data, (
..., 'contents', lambda _, v: v['type'] == 'video',
'model', 'blocks', lambda _, v: v['type'] == 'media',
'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata',
..., 'contents', is_type('video'),
'model', 'blocks', is_type('media'),
'model', 'blocks', is_type('mediaMetadata'),
'model', {dict}, any))
if model:
entry = parse_model(model)
if entry:
if entry.get('timestamp') is None:
if entry := parse_model(model):
if not entry.get('timestamp'):
entry['timestamp'] = traverse_obj(next_data, (
..., 'contents', lambda _, v: v['type'] == 'timestamp',
..., 'contents', is_type('timestamp'),
'model', 'timestamp', {k_int_or_none}, any))
entries.append(entry)
return self.playlist_result(