Compare commits

...

25 Commits

Author SHA1 Message Date
Riteo
9b676aaaf2
Merge 1cae3bf46d into c699bafc50 2024-11-16 01:03:02 +00:00
bashonly
c699bafc50 [ie/soop] Fix thumbnail extraction (#11545)
Closes #11537

Authored by: bashonly
2024-11-15 22:51:55 +00:00
bashonly
eb64ae7d5d [ie] Allow ext override for thumbnails (#11545)
Authored by: bashonly
2024-11-15 22:51:55 +00:00
Simon Sawicki
c014fbcddc
[utils] subs_list_to_dict: Add lang default parameter (#11508)
Authored by: Grub4K
2024-11-15 23:25:52 +01:00
Simon Sawicki
39d79c9b9c
[utils] Fix join_nonempty, add **kwargs to unpack (#11559)
Authored by: Grub4K
2024-11-15 22:06:15 +01:00
Riteo
1cae3bf46d Use unpack operator for files to delete 2024-11-08 03:52:50 +01:00
Riteo
4aa3c401d4 Do not pass -map -0:s multiple times 2024-11-08 03:49:39 +01:00
Riteo
0cc0f3f086 Merge remote-tracking branch 'origin/master' into json-subtitles 2024-11-08 03:44:09 +01:00
Riteo
85a844aef3 Select copy mode depending on extension 2024-09-11 11:43:33 +02:00
Riteo
17781f9d7d Remove debug thing
I'm dumb
2024-09-08 13:33:24 +02:00
Riteo
fc349670c3 Fix info attachment in subpaths 2024-09-08 13:30:35 +02:00
Riteo
4b5be635b1 Add missing comma (again)
oops
2024-09-08 13:30:35 +02:00
Riteo
45d1f2bb6c Fix attachments in subpaths 2024-09-08 13:30:32 +02:00
Riteo
7fb0c05ff6 Revert format check stuff 2024-09-08 13:04:59 +02:00
Riteo
aaa25eb508 Add missing trailing comma 2024-08-14 03:18:55 +02:00
Riteo
780bfd044f Pass target extension to all stream_copy_opts instances 2024-08-14 03:05:11 +02:00
Riteo
fe5de0005e Add extra checks for non-matroska formats when copying 2024-08-14 02:55:33 +02:00
Riteo
9db000a9af Check also if there are json subtitles 2024-08-14 02:55:29 +02:00
Riteo
62e274f515 Move regular subtitles options to their loop 2024-08-14 02:10:14 +02:00
Riteo
e202aae5d6 Remove redundant copy_unknown 2024-08-14 02:03:09 +02:00
Riteo
3b8050da5b Merge remote-tracking branch 'origin/master' into json-subtitles 2024-08-14 02:02:56 +02:00
Riteo
38a9f70044 Use a map for JSON sub handling instead of two lists 2024-08-14 01:16:15 +02:00
Riteo
550b3a046a Use the -copy_unknown flag in the stream copy options
Also split the yield expression as the comment above was a bit
misleading (it was only related to the `-dn` flag).
2024-08-13 22:30:08 +02:00
Riteo
ba3a7232f0 [pp/FFmpegEmbedSubtitle] Embed JSON subtitles as Matroska attachments
Since we can't embed them as regular subtitles (due to them not having
any consistent structure), we embed them as file attachments, if
exporting as Matroska.

This allows us to have single-file downloads with everything embedded
for e.g. archival purposes.
2024-06-14 16:56:54 +02:00
Riteo
339828d777 [pp/FFmpegMetadata] Use metadata stream specifier for info.json
The old stream index specifiers would indiscriminately select any JSON
attachment, which made stuff like embedding live chat json data risky if
not impossible.

Also adds `-copy_unknown` as JSON data is "unknown" according to FFmpeg
(since it has no codec id) and thus would otherwise be rejected by
default.
2024-06-14 16:56:52 +02:00
8 changed files with 148 additions and 52 deletions

View File

@ -481,7 +481,7 @@ class TestTraversalHelpers:
'id': 'name',
'data': 'content',
'url': 'url',
}, all, {subs_list_to_dict}]) == {
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [{'data': 'content'}],
}, 'subs with mandatory items missing should be filtered'
@ -507,6 +507,54 @@ class TestTraversalHelpers:
{'url': 'https://example.com/subs/en1', 'ext': 'ext'},
{'url': 'https://example.com/subs/en2', 'ext': 'ext'},
]}, '`quality` key should sort subtitle list accordingly'
assert traverse_obj([
{'name': 'de', 'url': 'https://example.com/subs/de.ass'},
{'name': 'de'},
{'name': 'en', 'content': 'content'},
{'url': 'https://example.com/subs/en'},
], [..., {
'id': 'name',
'url': 'url',
'data': 'content',
}, all, {subs_list_to_dict(lang='en')}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [
{'data': 'content'},
{'url': 'https://example.com/subs/en'},
],
}, 'optionally provided lang should be used if no id available'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be ignored for id and ext'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang='de')}]) == {
'de': [
{'url': 'https://example.com/subs/de1'},
{'url': 'https://example.com/subs/de2'},
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be replaced by default id'
def test_trim_str(self):
with pytest.raises(TypeError):
@ -525,7 +573,7 @@ class TestTraversalHelpers:
def test_unpack(self):
assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123'
assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3'
assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3'
assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3'
with pytest.raises(TypeError):
unpack(join_nonempty)()
with pytest.raises(TypeError):

View File

@ -72,7 +72,6 @@ from yt_dlp.utils import (
intlist_to_bytes,
iri_to_uri,
is_html,
join_nonempty,
js_to_json,
limit_length,
locked_file,
@ -2158,10 +2157,6 @@ Line 1
assert int_or_none(v=10) == 10, 'keyword passed positional should call function'
assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function'
assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially'
assert callable(join_nonempty()), 'varargs positional should apply partially'
assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function'
if __name__ == '__main__':
unittest.main()

View File

@ -4381,7 +4381,9 @@ class YoutubeDL:
return None
for idx, t in list(enumerate(thumbnails))[::-1]:
thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
if multiple:
thumb_ext = f'{t["id"]}.{thumb_ext}'
thumb_display_id = f'{label} thumbnail {t["id"]}'
thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))

View File

@ -66,6 +66,14 @@ class AfreecaTVBaseIE(InfoExtractor):
extensions={'legacy_ssl': True}), display_id,
'Downloading API JSON', 'Unable to download API JSON')
@staticmethod
def _fixup_thumb(thumb_url):
    """Wrap a thumbnail URL in a one-item thumbnails list with an explicit 'jpg' ext.

    Returns None when `url_or_none` rejects `thumb_url`; otherwise returns
    a list holding a single dict with the 'url' and 'ext' keys.
    """
    # Core would determine_ext as 'php' from the url, so we need to provide the real ext
    # See: https://github.com/yt-dlp/yt-dlp/issues/11537
    if url_or_none(thumb_url):
        return [{'url': thumb_url, 'ext': 'jpg'}]
    return None
class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'soop'
@ -155,7 +163,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {int_or_none(scale=1000)}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
})
entries = []
@ -226,8 +234,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
return self.playlist_result(self._entries(data), video_id)
@staticmethod
def _entries(data):
def _entries(self, data):
# 'files' is always a list with 1 element
yield from traverse_obj(data, (
'data', lambda _, v: v['story_type'] == 'catch',
@ -238,7 +245,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('writer_id', {str}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
'timestamp': ('write_timestamp', {int_or_none}),
}))

View File

@ -279,6 +279,7 @@ class InfoExtractor:
thumbnails: A list of dictionaries, with the following entries:
* "id" (optional, string) - Thumbnail format ID
* "url"
* "ext" (optional, string) - actual image extension if not given in URL
* "preference" (optional, int) - quality of the image
* "width" (optional, int)
* "height" (optional, int)

View File

@ -220,9 +220,20 @@ class FFmpegPostProcessor(PostProcessor):
@staticmethod
def stream_copy_opts(copy=True, *, ext=None):
yield from ('-map', '0')
if ext in ('mkv', 'mka'):
# Some streams, such as JSON attachments, are considered of unknown
# type by FFmpeg but we still want to copy them.
yield '-copy_unknown'
else:
# Most containers don't really like unknown streams. Let's make
# sure to get rid of them.
yield '-ignore_unknown'
# Don't copy Apple TV chapters track, bin_data
# See https://github.com/yt-dlp/yt-dlp/issues/2, #19042, #19024, https://trac.ffmpeg.org/ticket/6016
yield from ('-dn', '-ignore_unknown')
yield '-dn'
if copy:
yield from ('-c', 'copy')
if ext in ('mp4', 'mov', 'm4a'):
@ -557,7 +568,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
yield from FFmpegPostProcessor.stream_copy_opts(False)
yield from FFmpegPostProcessor.stream_copy_opts(False, ext=target_ext)
if target_ext == 'avi':
yield from ('-c:v', 'libxvid', '-vtag', 'XVID')
@ -583,7 +594,7 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP):
@staticmethod
def _options(target_ext):
return FFmpegPostProcessor.stream_copy_opts()
return FFmpegPostProcessor.stream_copy_opts(ext=target_ext)
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@ -620,13 +631,18 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
webm_vtt_warn = False
mp4_ass_warn = False
json_subs = {}
for lang, sub_info in subtitles.items():
if not os.path.exists(sub_info.get('filepath', '')):
self.report_warning(f'Skipping embedding {lang} subtitle because the file is missing')
continue
sub_ext = sub_info['ext']
if sub_ext == 'json':
self.report_warning('JSON subtitles cannot be embedded')
if info['ext'] in ('mkv', 'mka'):
json_subs[lang] = sub_info['filepath']
else:
self.report_warning('JSON subtitles can only be embedded in mkv/mka files.')
elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
sub_langs.append(lang)
sub_names.append(sub_info.get('name'))
@ -639,31 +655,48 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
mp4_ass_warn = True
self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues')
if not sub_langs:
if not sub_langs and not json_subs:
return [], info
input_files = [filename, *sub_filenames]
opts = [
*self.stream_copy_opts(ext=info['ext']),
# Don't copy the existing subtitles, we may be running the
# postprocessor a second time
'-map', '-0:s',
]
opts = [*self.stream_copy_opts(ext=info['ext'])]
if sub_langs and sub_names:
# We have regular subtitles available to embed. Don't copy the
# existing subtitles, we may be running the postprocessor a second
# time.
opts.extend([
'-map', '-0:s',
])
for i, (lang, name) in enumerate(zip(sub_langs, sub_names)):
opts.extend(['-map', f'{i + 1}:0'])
lang_code = ISO639Utils.short2long(lang) or lang
opts.extend([f'-metadata:s:s:{i}', f'language={lang_code}'])
opts.extend([
'-map', f'{i + 1}:0',
f'-metadata:s:s:{i}', f'language={lang_code}',
])
if name:
opts.extend([f'-metadata:s:s:{i}', f'handler_name={name}',
f'-metadata:s:s:{i}', f'title={name}'])
for json_lang, json_filename in json_subs.items():
escaped_json_filename = self._ffmpeg_filename_argument(json_filename)
json_basename = os.path.basename(json_filename)
opts.extend([
'-map', f'-0:m:filename:{json_lang}.json?',
'-attach', escaped_json_filename,
f'-metadata:s:m:filename:{json_basename}', 'mimetype=application/json',
f'-metadata:s:m:filename:{json_basename}', f'filename={json_lang}.json',
])
temp_filename = prepend_extension(filename, 'temp')
self.to_screen(f'Embedding subtitles in "{filename}"')
self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
os.replace(temp_filename, filename)
files_to_delete = [] if self._already_have_subtitle else sub_filenames
files_to_delete = [] if self._already_have_subtitle else [*sub_filenames, *json_subs.values()]
return files_to_delete, info
@ -678,7 +711,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
@staticmethod
def _options(target_ext):
audio_only = target_ext == 'm4a'
yield from FFmpegPostProcessor.stream_copy_opts(not audio_only)
yield from FFmpegPostProcessor.stream_copy_opts(not audio_only, ext=target_ext)
if audio_only:
yield from ('-vn', '-acodec', 'copy')
@ -806,15 +839,20 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
write_json_file(self._downloader.sanitize_info(info, self.get_param('clean_infojson', True)), infofn)
info['infojson_filename'] = infofn
old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
if old_stream is not None:
yield ('-map', f'-0:{old_stream}')
new_stream -= 1
escaped_name = self._ffmpeg_filename_argument(infofn)
info_basename = os.path.basename(infofn)
yield (
'-attach', self._ffmpeg_filename_argument(infofn),
f'-metadata:s:{new_stream}', 'mimetype=application/json',
f'-metadata:s:{new_stream}', 'filename=info.json',
# In order to override any old info.json reliably we need to
# instruct FFmpeg to consider valid tracks without a codec id, like
# JSON attachments.
'-copy_unknown',
# This map operation allows us to actually replace any previous
# info.json data.
'-map', '-0:m:filename:info.json?',
'-attach', escaped_name,
f'-metadata:s:m:filename:{info_basename}', 'mimetype=application/json',
f'-metadata:s:m:filename:{info_basename}', 'filename=info.json',
)
@ -873,7 +911,7 @@ class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor):
stretched_ratio = info.get('stretched_ratio')
if stretched_ratio not in (None, 1):
self._fixup('Fixing aspect ratio', info['filepath'], [
*self.stream_copy_opts(), '-aspect', f'{stretched_ratio:f}'])
*self.stream_copy_opts(ext=info['ext']), '-aspect', f'{stretched_ratio:f}'])
return [], info
@ -881,7 +919,7 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False, video=False)
def run(self, info):
if info.get('container') == 'm4a_dash':
self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(), '-f', 'mp4'])
self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(ext=info['ext']), '-f', 'mp4'])
return [], info
@ -904,7 +942,7 @@ class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
if self.get_audio_codec(info['filepath']) == 'aac':
args.extend(['-bsf:a', 'aac_adtstoasc'])
self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
*self.stream_copy_opts(), *args])
*self.stream_copy_opts(ext=info['ext']), *args])
return [], info
@ -925,7 +963,7 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor):
opts = ['-vf', 'setpts=PTS-STARTPTS']
else:
opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS']
self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False), '-ss', self.trim])
self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False, ext=info['ext']), '-ss', self.trim])
return [], info
@ -934,7 +972,7 @@ class FFmpegCopyStreamPP(FFmpegFixupPostProcessor):
@PostProcessor._restrict_to(images=False)
def run(self, info):
self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts())
self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts(ext=info['ext']))
return [], info
@ -1063,7 +1101,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
self.to_screen(f'Splitting video by chapters; {len(chapters)} chapters found')
for idx, chapter in enumerate(chapters):
destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts(ext=info['ext']))])
if in_file != info['filepath']:
self._delete_downloaded_files(in_file, msg=None)
return [], info

View File

@ -216,7 +216,7 @@ def partial_application(func):
sig = inspect.signature(func)
required_args = [
param.name for param in sig.parameters.values()
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
if param.default is inspect.Parameter.empty
]
@ -4837,7 +4837,6 @@ def number_of_digits(number):
return len('%d' % number)
@partial_application
def join_nonempty(*values, delim='-', from_dict=None):
if from_dict is not None:
values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)

View File

@ -332,14 +332,14 @@ class _RequiredError(ExtractorError):
@typing.overload
def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
@typing.overload
def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None):
"""
Convert subtitles from a traversal into a subtitle dict.
The path should have an `all` immediately before this function.
@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
`quality` The sort order for each subtitle
"""
if subs is None:
return functools.partial(subs_list_to_dict, ext=ext)
return functools.partial(subs_list_to_dict, lang=lang, ext=ext)
result = collections.defaultdict(list)
@ -360,10 +360,16 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
if not url_or_none(sub.get('url')) and not sub.get('data'):
continue
sub_id = sub.pop('id', None)
if sub_id is None:
continue
if ext is not None and not sub.get('ext'):
sub['ext'] = ext
if not isinstance(sub_id, str):
if not lang:
continue
sub_id = lang
sub_ext = sub.get('ext')
if not isinstance(sub_ext, str):
if not ext:
sub.pop('ext', None)
else:
sub['ext'] = ext
result[sub_id].append(sub)
result = dict(result)
@ -452,9 +458,9 @@ def trim_str(*, start=None, end=None):
return trim
def unpack(func):
def unpack(func, **kwargs):
@functools.wraps(func)
def inner(items, **kwargs):
def inner(items):
return func(*items, **kwargs)
return inner