Compare commits

...

23 Commits

Author SHA1 Message Date
Riteo
1b39c4c782
Merge 1cae3bf46d into f2a4983df7 2024-11-13 02:33:12 +00:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
Riteo
1cae3bf46d Use unpack operator for files to delete 2024-11-08 03:52:50 +01:00
Riteo
4aa3c401d4 Do not pass -map -0:s multiple times 2024-11-08 03:49:39 +01:00
Riteo
0cc0f3f086 Merge remote-tracking branch 'origin/master' into json-subtitles 2024-11-08 03:44:09 +01:00
Riteo
85a844aef3 Select copy mode depending on extension 2024-09-11 11:43:33 +02:00
Riteo
17781f9d7d Remove debug thing
I'm dumb
2024-09-08 13:33:24 +02:00
Riteo
fc349670c3 Fix info attachment in subpaths 2024-09-08 13:30:35 +02:00
Riteo
4b5be635b1 Add missing comma (again)
oops
2024-09-08 13:30:35 +02:00
Riteo
45d1f2bb6c Fix attachments in subpaths 2024-09-08 13:30:32 +02:00
Riteo
7fb0c05ff6 Revert format check stuff 2024-09-08 13:04:59 +02:00
Riteo
aaa25eb508 Add missing trailing comma 2024-08-14 03:18:55 +02:00
Riteo
780bfd044f Pass target extension to all stream_copy_opts instances 2024-08-14 03:05:11 +02:00
Riteo
fe5de0005e Add extra checks for non-matroska formats when copying 2024-08-14 02:55:33 +02:00
Riteo
9db000a9af Check also if there are json subtitles 2024-08-14 02:55:29 +02:00
Riteo
62e274f515 Move regular subtitles options to their loop 2024-08-14 02:10:14 +02:00
Riteo
e202aae5d6 Remove redundant copy_unknown 2024-08-14 02:03:09 +02:00
Riteo
3b8050da5b Merge remote-tracking branch 'origin/master' into json-subtitles 2024-08-14 02:02:56 +02:00
Riteo
38a9f70044 Use a map for JSON sub handling instead of two lists 2024-08-14 01:16:15 +02:00
Riteo
550b3a046a Use the -copy_unknown flag in the stream copy options
Also split the yield expression as the comment above was a bit
misleading (it was only related to the `-dn` flag).
2024-08-13 22:30:08 +02:00
Riteo
ba3a7232f0 [pp/FFmpegEmbedSubtitle] Embed JSON subtitles as Matroska attachments
Since we can't embed them as regular subtitles (due to them not having
any consistent structure), we embed them as file attachments, if
exporting as Matroska.

This allows us to have single-file downloads with everything embedded
for e.g. archival purposes.
2024-06-14 16:56:54 +02:00
Riteo
339828d777 [pp/FFmpegMetadata] Use metadata stream specifier for info.json
The old stream index specifiers would indiscriminately select any JSON
attachment, which made stuff like embedding live chat json data risky if
not impossible.

Also adds `-copy_unknown` as JSON data is "unknown" according to FFmpeg
(since it has no codec id) and thus would otherwise be rejected by
default.
2024-06-14 16:56:52 +02:00
3 changed files with 117 additions and 34 deletions

View File

@@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
},
},
],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}]
@staticmethod
@@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({
'id': review.get('review_id'),
'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'})

View File

@@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats):
def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest:
formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=url_or_none(video.get('dash_manifest_url'))))
mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
@@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = []
q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
@@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
if not playable_url:
continue
if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id))
formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else:
formats.append({
'format_id': format_id,
@@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
'url': playable_url,
})
extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats:
# Do not append false positive entry w/o any formats
return

View File

@@ -220,9 +220,20 @@ class FFmpegPostProcessor(PostProcessor):
@staticmethod
def stream_copy_opts(copy=True, *, ext=None):
yield from ('-map', '0')
if ext in ('mkv', 'mka'):
# Some streams, such as JSON attachments, are considered of unknown
# type by FFmpeg but we still want to copy them.
yield '-copy_unknown'
else:
# Most containers don't really like unknown streams. Let's make
# sure to get rid of them.
yield '-ignore_unknown'
# Don't copy Apple TV chapters track, bin_data
# See https://github.com/yt-dlp/yt-dlp/issues/2, #19042, #19024, https://trac.ffmpeg.org/ticket/6016
yield from ('-dn', '-ignore_unknown')
yield '-dn'
if copy:
yield from ('-c', 'copy')
if ext in ('mp4', 'mov', 'm4a'):
@@ -557,7 +568,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
yield from FFmpegPostProcessor.stream_copy_opts(False)
yield from FFmpegPostProcessor.stream_copy_opts(False, ext=target_ext)
if target_ext == 'avi':
yield from ('-c:v', 'libxvid', '-vtag', 'XVID')
@@ -583,7 +594,7 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP):
@staticmethod
def _options(target_ext):
return FFmpegPostProcessor.stream_copy_opts()
return FFmpegPostProcessor.stream_copy_opts(ext=target_ext)
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@@ -620,13 +631,18 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
webm_vtt_warn = False
mp4_ass_warn = False
json_subs = {}
for lang, sub_info in subtitles.items():
if not os.path.exists(sub_info.get('filepath', '')):
self.report_warning(f'Skipping embedding {lang} subtitle because the file is missing')
continue
sub_ext = sub_info['ext']
if sub_ext == 'json':
self.report_warning('JSON subtitles cannot be embedded')
if info['ext'] in ('mkv', 'mka'):
json_subs[lang] = sub_info['filepath']
else:
self.report_warning('JSON subtitles can only be embedded in mkv/mka files.')
elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
sub_langs.append(lang)
sub_names.append(sub_info.get('name'))
@@ -639,31 +655,48 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
mp4_ass_warn = True
self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues')
if not sub_langs:
if not sub_langs and not json_subs:
return [], info
input_files = [filename, *sub_filenames]
opts = [
*self.stream_copy_opts(ext=info['ext']),
# Don't copy the existing subtitles, we may be running the
# postprocessor a second time
'-map', '-0:s',
]
opts = [*self.stream_copy_opts(ext=info['ext'])]
if sub_langs and sub_names:
# We have regular subtitles available to embed. Don't copy the
# existing subtitles, we may be running the postprocessor a second
# time.
opts.extend([
'-map', '-0:s',
])
for i, (lang, name) in enumerate(zip(sub_langs, sub_names)):
opts.extend(['-map', f'{i + 1}:0'])
lang_code = ISO639Utils.short2long(lang) or lang
opts.extend([f'-metadata:s:s:{i}', f'language={lang_code}'])
opts.extend([
'-map', f'{i + 1}:0',
f'-metadata:s:s:{i}', f'language={lang_code}',
])
if name:
opts.extend([f'-metadata:s:s:{i}', f'handler_name={name}',
f'-metadata:s:s:{i}', f'title={name}'])
for json_lang, json_filename in json_subs.items():
escaped_json_filename = self._ffmpeg_filename_argument(json_filename)
json_basename = os.path.basename(json_filename)
opts.extend([
'-map', f'-0:m:filename:{json_lang}.json?',
'-attach', escaped_json_filename,
f'-metadata:s:m:filename:{json_basename}', 'mimetype=application/json',
f'-metadata:s:m:filename:{json_basename}', f'filename={json_lang}.json',
])
temp_filename = prepend_extension(filename, 'temp')
self.to_screen(f'Embedding subtitles in "{filename}"')
self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
os.replace(temp_filename, filename)
files_to_delete = [] if self._already_have_subtitle else sub_filenames
files_to_delete = [] if self._already_have_subtitle else [*sub_filenames, *json_subs.values()]
return files_to_delete, info
@@ -678,7 +711,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
audio_only = target_ext == 'm4a'
yield from FFmpegPostProcessor.stream_copy_opts(not audio_only)
yield from FFmpegPostProcessor.stream_copy_opts(not audio_only, ext=target_ext)
if audio_only:
yield from ('-vn', '-acodec', 'copy')
@@ -806,15 +839,20 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
write_json_file(self._downloader.sanitize_info(info, self.get_param('clean_infojson', True)), infofn)
info['infojson_filename'] = infofn
old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
if old_stream is not None:
yield ('-map', f'-0:{old_stream}')
new_stream -= 1
escaped_name = self._ffmpeg_filename_argument(infofn)
info_basename = os.path.basename(infofn)
yield (
'-attach', self._ffmpeg_filename_argument(infofn),
f'-metadata:s:{new_stream}', 'mimetype=application/json',
f'-metadata:s:{new_stream}', 'filename=info.json',
# In order to override any old info.json reliably we need to
# instruct FFmpeg to consider valid tracks without a codec id, like
# JSON attachments.
'-copy_unknown',
# This map operation allows us to actually replace any previous
# info.json data.
'-map', '-0:m:filename:info.json?',
'-attach', escaped_name,
f'-metadata:s:m:filename:{info_basename}', 'mimetype=application/json',
f'-metadata:s:m:filename:{info_basename}', 'filename=info.json',
)
@@ -873,7 +911,7 @@ class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor):
stretched_ratio = info.get('stretched_ratio')
if stretched_ratio not in (None, 1):
self._fixup('Fixing aspect ratio', info['filepath'], [
*self.stream_copy_opts(), '-aspect', f'{stretched_ratio:f}'])
*self.stream_copy_opts(ext=info['ext']), '-aspect', f'{stretched_ratio:f}'])
return [], info
@@ -881,7 +919,7 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False, video=False)
def run(self, info):
if info.get('container') == 'm4a_dash':
self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(), '-f', 'mp4'])
self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(ext=info['ext']), '-f', 'mp4'])
return [], info
@@ -904,7 +942,7 @@ class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
if self.get_audio_codec(info['filepath']) == 'aac':
args.extend(['-bsf:a', 'aac_adtstoasc'])
self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
*self.stream_copy_opts(), *args])
*self.stream_copy_opts(ext=info['ext']), *args])
return [], info
@@ -925,7 +963,7 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor):
opts = ['-vf', 'setpts=PTS-STARTPTS']
else:
opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS']
self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False), '-ss', self.trim])
self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False, ext=info['ext']), '-ss', self.trim])
return [], info
@@ -934,7 +972,7 @@ class FFmpegCopyStreamPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts())
self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts(ext=info['ext']))
return [], info
@@ -1063,7 +1101,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
self.to_screen(f'Splitting video by chapters; {len(chapters)} chapters found')
for idx, chapter in enumerate(chapters):
destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts(ext=info['ext']))])
if in_file != info['filepath']:
self._delete_downloaded_files(in_file, msg=None)
return [], info