Compare commits

...

8 Commits

Author SHA1 Message Date
Paul Storkman
9cedfde981
Merge 716972da6b into f2a4983df7 2024-11-13 01:47:10 +01:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
Paul Storkman
716972da6b Formatting mistake 2024-10-31 14:31:23 +01:00
Paul Storkman
5c4df56d6d Start counting from zero. 2024-10-31 14:10:47 +01:00
Paul Storkman
9438d15dff Just return nothing on max retries, same as with extractor errors 2024-10-30 13:55:26 +01:00
Paul Storkman
87ad4d4774 Fix quotes 2024-10-29 16:02:57 +01:00
Paul Storkman
c6cc3a8ab2 Add option --wait-retries 2024-10-29 14:25:52 +01:00
6 changed files with 70 additions and 6 deletions

View File

@ -355,6 +355,10 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git
available. Pass the minimum number of available. Pass the minimum number of
seconds (or range) to wait between retries seconds (or range) to wait between retries
--no-wait-for-video Do not wait for scheduled streams (default) --no-wait-for-video Do not wait for scheduled streams (default)
--wait-retries RETRIES Number of retries while waiting for
scheduled streams to become available
(default is infinite). --wait-for-video must
also be set
--mark-watched Mark videos watched (even with --simulate) --mark-watched Mark videos watched (even with --simulate)
--no-mark-watched Do not mark videos watched (default) --no-mark-watched Do not mark videos watched (default)
--color [STREAM:]POLICY Whether to emit color codes in output, --color [STREAM:]POLICY Whether to emit color codes in output,

View File

@ -1620,17 +1620,26 @@ class YoutubeDL:
def _handle_extraction_exceptions(func): def _handle_extraction_exceptions(func):
@functools.wraps(func) @functools.wraps(func)
def wrapper(self, *args, **kwargs): def wrapper(self, *args, **kwargs):
wait_retries = 0
max_wait_retries = self.params.get('wait_retries')
while True: while True:
try: try:
return func(self, *args, **kwargs) return func(self, *args, **kwargs)
except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError): except (CookieLoadError, DownloadCancelled, LazyList.IndexError, PagedList.IndexError):
raise raise
except ReExtractInfo as e: except ReExtractInfo as e:
if wait_retries >= max_wait_retries:
if max_wait_retries > 0:
self.report_error(f'Giving up after {wait_retries} {"retries" if wait_retries > 1 else "retry"} while waiting.')
else:
self.report_error('Video is still unavailable after waiting.')
return
if e.expected: if e.expected:
self.to_screen(f'{e}; Re-extracting data') self.to_screen(f'{e}; Re-extracting data')
else: else:
self.to_stderr('\r') self.to_stderr('\r')
self.report_warning(f'{e}; Re-extracting data') self.report_warning(f'{e}; Re-extracting data')
wait_retries += 1
continue continue
except GeoRestrictedError as e: except GeoRestrictedError as e:
msg = e.msg msg = e.msg

View File

@ -269,6 +269,7 @@ def validate_options(opts):
opts.retries = parse_retries('download', opts.retries) opts.retries = parse_retries('download', opts.retries)
opts.fragment_retries = parse_retries('fragment', opts.fragment_retries) opts.fragment_retries = parse_retries('fragment', opts.fragment_retries)
opts.wait_retries = parse_retries('waiting', opts.wait_retries)
opts.extractor_retries = parse_retries('extractor', opts.extractor_retries) opts.extractor_retries = parse_retries('extractor', opts.extractor_retries)
opts.file_access_retries = parse_retries('file access', opts.file_access_retries) opts.file_access_retries = parse_retries('file access', opts.file_access_retries)
@ -929,6 +930,7 @@ def parse_options(argv=None):
'extract_flat': opts.extract_flat, 'extract_flat': opts.extract_flat,
'live_from_start': opts.live_from_start, 'live_from_start': opts.live_from_start,
'wait_for_video': opts.wait_for_video, 'wait_for_video': opts.wait_for_video,
'wait_retries': opts.wait_retries,
'mark_watched': opts.mark_watched, 'mark_watched': opts.mark_watched,
'merge_output_format': opts.merge_output_format, 'merge_output_format': opts.merge_output_format,
'final_ext': final_ext, 'final_ext': final_ext,

View File

@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
}, },
}, },
], ],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}] }]
@staticmethod @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({ info['comments'].append({
'id': review.get('review_id'), 'id': review.get('review_id'),
'author': review.get('reviewer'), 'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')), 'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'}) 'parent': 'root'})

View File

@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get( return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats): def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj( dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str) vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest: if dash_manifest:
formats.extend(self._parse_mpd_formats( formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=url_or_none(video.get('dash_manifest_url')))) mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
def process_formats(info): def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
video = video['creation_story'] video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info) video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = [] formats = []
q = qualities(['sd', 'hd']) q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')): ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
if not playable_url: if not playable_url:
continue continue
if determine_ext(playable_url) == 'mpd': if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id)) formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else: else:
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
'url': playable_url, 'url': playable_url,
}) })
extract_dash_manifest(fmt_data, formats) extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats: if not formats:
# Do not append false positive entry w/o any formats # Do not append false positive entry w/o any formats
return return

View File

@ -442,6 +442,10 @@ def create_parser():
'--no-wait-for-video', '--no-wait-for-video',
dest='wait_for_video', action='store_const', const=None, dest='wait_for_video', action='store_const', const=None,
help='Do not wait for scheduled streams (default)') help='Do not wait for scheduled streams (default)')
general.add_option(
'--wait-retries',
dest='wait_retries', metavar='RETRIES', default='infinite',
help='Number of retries while waiting for scheduled streams to become available (default is %default). --wait-for-video must also be set')
general.add_option( general.add_option(
'--mark-watched', '--mark-watched',
action='store_true', dest='mark_watched', default=False, action='store_true', dest='mark_watched', default=False,