Merge 1cae3bf46d into c699bafc50

[ie/soop] Fix thumbnail extraction (#11545)
Closes #11537 Authored by: bashonly
2024-11-26 01:01:25 +01:00 · 2024-11-16 01:03:02 +00:00 · 2024-11-15 22:51:55 +00:00 · 2024-11-15 22:51:55 +00:00 · 2024-11-15 23:25:52 +01:00 · 2024-11-15 22:06:15 +01:00
8 changed files with 148 additions and 52 deletions
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
@ -481,7 +481,7 @@ class TestTraversalHelpers:
            'id': 'name',
            'data': 'content',
            'url': 'url',
-        }, all, {subs_list_to_dict}]) == {
+        }, all, {subs_list_to_dict(lang=None)}]) == {
            'de': [{'url': 'https://example.com/subs/de.ass'}],
            'en': [{'data': 'content'}],
        }, 'subs with mandatory items missing should be filtered'
@ -507,6 +507,54 @@ class TestTraversalHelpers:
            {'url': 'https://example.com/subs/en1', 'ext': 'ext'},
            {'url': 'https://example.com/subs/en2', 'ext': 'ext'},
        ]}, '`quality` key should sort subtitle list accordingly'
        assert traverse_obj([
            {'name': 'de', 'url': 'https://example.com/subs/de.ass'},
            {'name': 'de'},
            {'name': 'en', 'content': 'content'},
            {'url': 'https://example.com/subs/en'},
        ], [..., {
            'id': 'name',
            'url': 'url',
            'data': 'content',
        }, all, {subs_list_to_dict(lang='en')}]) == {
            'de': [{'url': 'https://example.com/subs/de.ass'}],
            'en': [
                {'data': 'content'},
                {'url': 'https://example.com/subs/en'},
            ],
        }, 'optionally provided lang should be used if no id available'
        assert traverse_obj([
            {'name': 1, 'url': 'https://example.com/subs/de1'},
            {'name': {}, 'url': 'https://example.com/subs/de2'},
            {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
            {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
        ], [..., {
            'id': 'name',
            'url': 'url',
            'ext': 'ext',
        }, all, {subs_list_to_dict(lang=None)}]) == {
            'de': [
                {'url': 'https://example.com/subs/de3'},
                {'url': 'https://example.com/subs/de4'},
            ],
        }, 'non str types should be ignored for id and ext'
        assert traverse_obj([
            {'name': 1, 'url': 'https://example.com/subs/de1'},
            {'name': {}, 'url': 'https://example.com/subs/de2'},
            {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
            {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
        ], [..., {
            'id': 'name',
            'url': 'url',
            'ext': 'ext',
        }, all, {subs_list_to_dict(lang='de')}]) == {
            'de': [
                {'url': 'https://example.com/subs/de1'},
                {'url': 'https://example.com/subs/de2'},
                {'url': 'https://example.com/subs/de3'},
                {'url': 'https://example.com/subs/de4'},
            ],
        }, 'non str types should be replaced by default id'
    def test_trim_str(self):
        with pytest.raises(TypeError):
@ -525,7 +573,7 @@ class TestTraversalHelpers:
    def test_unpack(self):
        assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123'
        assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3'
-        assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3'
+        assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3'
        with pytest.raises(TypeError):
            unpack(join_nonempty)()
        with pytest.raises(TypeError):
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -72,7 +72,6 @@ from yt_dlp.utils import (
    intlist_to_bytes,
    iri_to_uri,
    is_html,
    join_nonempty,
    js_to_json,
    limit_length,
    locked_file,
@ -2158,10 +2157,6 @@ Line 1
        assert int_or_none(v=10) == 10, 'keyword passed positional should call function'
        assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function'
        assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially'
        assert callable(join_nonempty()), 'varargs positional should apply partially'
        assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function'
 if __name__ == '__main__':
    unittest.main()
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -4381,7 +4381,9 @@ class YoutubeDL:
            return None
        for idx, t in list(enumerate(thumbnails))[::-1]:
-            thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
+            thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
            if multiple:
                thumb_ext = f'{t["id"]}.{thumb_ext}'
            thumb_display_id = f'{label} thumbnail {t["id"]}'
            thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
            thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
--- a/yt_dlp/extractor/afreecatv.py
+++ b/yt_dlp/extractor/afreecatv.py
@ -66,6 +66,14 @@ class AfreecaTVBaseIE(InfoExtractor):
            extensions={'legacy_ssl': True}), display_id,
            'Downloading API JSON', 'Unable to download API JSON')
    @staticmethod
    def _fixup_thumb(thumb_url):
        """Build a single-entry `thumbnails` list for `thumb_url` with an explicit `jpg` ext.

        Returns None when `thumb_url` is rejected by `url_or_none`.
        """
        if url_or_none(thumb_url):
            # Core would determine_ext as 'php' from the url, so we need to provide the real ext
            # See: https://github.com/yt-dlp/yt-dlp/issues/11537
            return [{'url': thumb_url, 'ext': 'jpg'}]
        return None
 class AfreecaTVIE(AfreecaTVBaseIE):
    IE_NAME = 'soop'
@ -155,7 +163,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
            'uploader': ('writer_nick', {str}),
            'uploader_id': ('bj_id', {str}),
            'duration': ('total_file_duration', {int_or_none(scale=1000)}),
-            'thumbnail': ('thumb', {url_or_none}),
+            'thumbnails': ('thumb', {self._fixup_thumb}),
        })
        entries = []
@ -226,8 +234,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
        return self.playlist_result(self._entries(data), video_id)
-    @staticmethod
+    def _entries(self, data):
    def _entries(data):
        # 'files' is always a list with 1 element
        yield from traverse_obj(data, (
            'data', lambda _, v: v['story_type'] == 'catch',
@ -238,7 +245,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
                'title': ('title', {str}),
                'uploader': ('writer_nick', {str}),
                'uploader_id': ('writer_id', {str}),
-                'thumbnail': ('thumb', {url_or_none}),
+                'thumbnails': ('thumb', {self._fixup_thumb}),
                'timestamp': ('write_timestamp', {int_or_none}),
            }))
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -279,6 +279,7 @@ class InfoExtractor:
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "ext" (optional, string) - actual image extension if not given in URL
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@ -220,9 +220,20 @@ class FFmpegPostProcessor(PostProcessor):
    @staticmethod
    def stream_copy_opts(copy=True, *, ext=None):
        yield from ('-map', '0')
        if ext in ('mkv', 'mka'):
            # Some streams, such as JSON attachments, are considered of unknown
            # type by FFmpeg but we still want to copy them.
            yield '-copy_unknown'
        else:
            # Most containers don't really like unknown streams. Let's make
            # sure to get rid of them.
            yield '-ignore_unknown'
        # Don't copy Apple TV chapters track, bin_data
        # See https://github.com/yt-dlp/yt-dlp/issues/2, #19042, #19024, https://trac.ffmpeg.org/ticket/6016
-        yield from ('-dn', '-ignore_unknown')
+        yield '-dn'
        if copy:
            yield from ('-c', 'copy')
        if ext in ('mp4', 'mov', 'm4a'):
@ -557,7 +568,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
    @staticmethod
    def _options(target_ext):
-        yield from FFmpegPostProcessor.stream_copy_opts(False)
+        yield from FFmpegPostProcessor.stream_copy_opts(False, ext=target_ext)
        if target_ext == 'avi':
            yield from ('-c:v', 'libxvid', '-vtag', 'XVID')
@ -583,7 +594,7 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP):
    @staticmethod
    def _options(target_ext):
-        return FFmpegPostProcessor.stream_copy_opts()
+        return FFmpegPostProcessor.stream_copy_opts(ext=target_ext)
 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@ -620,13 +631,18 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
        webm_vtt_warn = False
        mp4_ass_warn = False
        json_subs = {}
        for lang, sub_info in subtitles.items():
            if not os.path.exists(sub_info.get('filepath', '')):
                self.report_warning(f'Skipping embedding {lang} subtitle because the file is missing')
                continue
            sub_ext = sub_info['ext']
            if sub_ext == 'json':
-                self.report_warning('JSON subtitles cannot be embedded')
+                if info['ext'] in ('mkv', 'mka'):
                    json_subs[lang] = sub_info['filepath']
                else:
                    self.report_warning('JSON subtitles can only be embedded in mkv/mka files.')
            elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
                sub_langs.append(lang)
                sub_names.append(sub_info.get('name'))
@ -639,31 +655,48 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
                mp4_ass_warn = True
                self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues')
-        if not sub_langs:
+        if not sub_langs and not json_subs:
            return [], info
        input_files = [filename, *sub_filenames]
-        opts = [
+        opts = [*self.stream_copy_opts(ext=info['ext'])]
-            *self.stream_copy_opts(ext=info['ext']),
+
-            # Don't copy the existing subtitles, we may be running the
+        if sub_langs and sub_names:
-            # postprocessor a second time
+            # We have regular subtitles available to embed. Don't copy the
            # existing subtitles, we may be running the postprocessor a second
            # time.
            opts.extend([
                '-map', '-0:s',
-        ]
+            ])
        for i, (lang, name) in enumerate(zip(sub_langs, sub_names)):
            opts.extend(['-map', f'{i + 1}:0'])
            lang_code = ISO639Utils.short2long(lang) or lang
-            opts.extend([f'-metadata:s:s:{i}', f'language={lang_code}'])
+            opts.extend([
                '-map', f'{i + 1}:0',
                f'-metadata:s:s:{i}', f'language={lang_code}',
            ])
            if name:
                opts.extend([f'-metadata:s:s:{i}', f'handler_name={name}',
                             f'-metadata:s:s:{i}', f'title={name}'])
        for json_lang, json_filename in json_subs.items():
            escaped_json_filename = self._ffmpeg_filename_argument(json_filename)
            json_basename = os.path.basename(json_filename)
            opts.extend([
                '-map', f'-0:m:filename:{json_lang}.json?',
                '-attach', escaped_json_filename,
                f'-metadata:s:m:filename:{json_basename}', 'mimetype=application/json',
                f'-metadata:s:m:filename:{json_basename}', f'filename={json_lang}.json',
            ])
        temp_filename = prepend_extension(filename, 'temp')
        self.to_screen(f'Embedding subtitles in "{filename}"')
        self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
        os.replace(temp_filename, filename)
-        files_to_delete = [] if self._already_have_subtitle else sub_filenames
+        files_to_delete = [] if self._already_have_subtitle else [*sub_filenames, *json_subs.values()]
        return files_to_delete, info
@ -678,7 +711,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
    @staticmethod
    def _options(target_ext):
        audio_only = target_ext == 'm4a'
-        yield from FFmpegPostProcessor.stream_copy_opts(not audio_only)
+        yield from FFmpegPostProcessor.stream_copy_opts(not audio_only, ext=target_ext)
        if audio_only:
            yield from ('-vn', '-acodec', 'copy')
@ -806,15 +839,20 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
            write_json_file(self._downloader.sanitize_info(info, self.get_param('clean_infojson', True)), infofn)
            info['infojson_filename'] = infofn
-        old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
+        escaped_name = self._ffmpeg_filename_argument(infofn)
-        if old_stream is not None:
+        info_basename = os.path.basename(infofn)
            yield ('-map', f'-0:{old_stream}')
            new_stream -= 1
        yield (
-            '-attach', self._ffmpeg_filename_argument(infofn),
+            # In order to override any old info.json reliably we need to
-            f'-metadata:s:{new_stream}', 'mimetype=application/json',
+            # instruct FFmpeg to consider valid tracks without a codec id, like
-            f'-metadata:s:{new_stream}', 'filename=info.json',
+            # JSON attachments.
            '-copy_unknown',
            # This map operation allows us to actually replace any previous
            # info.json data.
            '-map', '-0:m:filename:info.json?',
            '-attach', escaped_name,
            f'-metadata:s:m:filename:{info_basename}', 'mimetype=application/json',
            f'-metadata:s:m:filename:{info_basename}', 'filename=info.json',
        )
@ -873,7 +911,7 @@ class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor):
        stretched_ratio = info.get('stretched_ratio')
        if stretched_ratio not in (None, 1):
            self._fixup('Fixing aspect ratio', info['filepath'], [
-                *self.stream_copy_opts(), '-aspect', f'{stretched_ratio:f}'])
+                *self.stream_copy_opts(ext=info['ext']), '-aspect', f'{stretched_ratio:f}'])
        return [], info
@ -881,7 +919,7 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
    @PostProcessor._restrict_to(images=False, video=False)
    def run(self, info):
        if info.get('container') == 'm4a_dash':
-            self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(), '-f', 'mp4'])
+            self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(ext=info['ext']), '-f', 'mp4'])
        return [], info
@ -904,7 +942,7 @@ class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
            if self.get_audio_codec(info['filepath']) == 'aac':
                args.extend(['-bsf:a', 'aac_adtstoasc'])
            self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
-                *self.stream_copy_opts(), *args])
+                *self.stream_copy_opts(ext=info['ext']), *args])
        return [], info
@ -925,7 +963,7 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor):
            opts = ['-vf', 'setpts=PTS-STARTPTS']
        else:
            opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS']
-        self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False), '-ss', self.trim])
+        self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False, ext=info['ext']), '-ss', self.trim])
        return [], info
@ -934,7 +972,7 @@ class FFmpegCopyStreamPP(FFmpegFixupPostProcessor):
    @PostProcessor._restrict_to(images=False)
    def run(self, info):
-        self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts())
+        self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts(ext=info['ext']))
        return [], info
@ -1063,7 +1101,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
        self.to_screen(f'Splitting video by chapters; {len(chapters)} chapters found')
        for idx, chapter in enumerate(chapters):
            destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
-            self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
+            self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts(ext=info['ext']))])
        if in_file != info['filepath']:
            self._delete_downloaded_files(in_file, msg=None)
        return [], info
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -216,7 +216,7 @@ def partial_application(func):
    sig = inspect.signature(func)
    required_args = [
        param.name for param in sig.parameters.values()
-        if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
+        if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
        if param.default is inspect.Parameter.empty
    ]
@ -4837,7 +4837,6 @@ def number_of_digits(number):
    return len('%d' % number)
@partial_application
 def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
--- a/yt_dlp/utils/traversal.py
+++ b/yt_dlp/utils/traversal.py
@ -332,14 +332,14 @@ class _RequiredError(ExtractorError):
@typing.overload
-def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
+def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
@typing.overload
-def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ...
+def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ...
-def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
+def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None):
    """
    Convert subtitles from a traversal into a subtitle dict.
    The path should have an `all` immediately before this function.
@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
    `quality`  The sort order for each subtitle
    """
    if subs is None:
-        return functools.partial(subs_list_to_dict, ext=ext)
+        return functools.partial(subs_list_to_dict, lang=lang, ext=ext)
    result = collections.defaultdict(list)
@ -360,9 +360,15 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
        if not url_or_none(sub.get('url')) and not sub.get('data'):
            continue
        sub_id = sub.pop('id', None)
-        if sub_id is None:
+        if not isinstance(sub_id, str):
            if not lang:
                continue
-        if ext is not None and not sub.get('ext'):
+            sub_id = lang
        sub_ext = sub.get('ext')
        if not isinstance(sub_ext, str):
            if not ext:
                sub.pop('ext', None)
            else:
                sub['ext'] = ext
        result[sub_id].append(sub)
    result = dict(result)
@ -452,9 +458,9 @@ def trim_str(*, start=None, end=None):
    return trim
-def unpack(func):
+def unpack(func, **kwargs):
    @functools.wraps(func)
-    def inner(items, **kwargs):
+    def inner(items):
        return func(*items, **kwargs)
    return inner
Author	SHA1	Message	Date
Riteo	9b676aaaf2	Merge `1cae3bf46d` into `c699bafc50`	2024-11-16 01:03:02 +00:00
bashonly	c699bafc50	[ie/soop] Fix thumbnail extraction (#11545 ) Closes #11537 Authored by: bashonly	2024-11-15 22:51:55 +00:00
bashonly	eb64ae7d5d	[ie] Allow `ext` override for thumbnails (#11545 ) Authored by: bashonly	2024-11-15 22:51:55 +00:00
Simon Sawicki	c014fbcddc	[utils] `subs_list_to_dict`: Add `lang` default parameter (#11508 ) Authored by: Grub4K	2024-11-15 23:25:52 +01:00
Simon Sawicki	39d79c9b9c	[utils] Fix `join_nonempty`, add `**kwargs` to `unpack` (#11559 ) Authored by: Grub4K	2024-11-15 22:06:15 +01:00
Riteo	1cae3bf46d	Use unpack operator for files to delete	2024-11-08 03:52:50 +01:00
Riteo	4aa3c401d4	Do not pass `-map -0:s` multiple times	2024-11-08 03:49:39 +01:00
Riteo	0cc0f3f086	Merge remote-tracking branch 'origin/master' into json-subtitles	2024-11-08 03:44:09 +01:00
Riteo	85a844aef3	Select copy mode depending on extension	2024-09-11 11:43:33 +02:00
Riteo	17781f9d7d	Remove debug thing I'm dumb	2024-09-08 13:33:24 +02:00
Riteo	fc349670c3	Fix info attachment in subpaths	2024-09-08 13:30:35 +02:00
Riteo	4b5be635b1	Add missing comma (again) oops	2024-09-08 13:30:35 +02:00
Riteo	45d1f2bb6c	Fix attachments in subpaths	2024-09-08 13:30:32 +02:00
Riteo	7fb0c05ff6	Revert format check stuff	2024-09-08 13:04:59 +02:00
Riteo	aaa25eb508	Add missing trailing comma	2024-08-14 03:18:55 +02:00
Riteo	780bfd044f	Pass target extension to all stream_copy_opts instances	2024-08-14 03:05:11 +02:00
Riteo	fe5de0005e	Add extra checks for non-matroska formats when copying	2024-08-14 02:55:33 +02:00
Riteo	9db000a9af	Check also if there are json subtitles	2024-08-14 02:55:29 +02:00
Riteo	62e274f515	Move regular subtitles options to their loop	2024-08-14 02:10:14 +02:00
Riteo	e202aae5d6	Remove redundant copy_unknown	2024-08-14 02:03:09 +02:00
Riteo	3b8050da5b	Merge remote-tracking branch 'origin/master' into json-subtitles	2024-08-14 02:02:56 +02:00
Riteo	38a9f70044	Use a map for JSON sub handling instead of two lists	2024-08-14 01:16:15 +02:00
Riteo	550b3a046a	Use the -copy_unknown flag in the stream copy options. Also split the yield expression as the comment above was a bit misleading (it was only related to the `-dn` flag).	2024-08-13 22:30:08 +02:00
Riteo	ba3a7232f0	[pp/FFmpegEmbedSubtitle] Embed JSON subtitles as Matroska attachments Since we can't embed them as regular subtitles (due to them not having any consistent structure), we embed them as file attachments, if exporting as Matroska. This allows us to have single-file downloads with everything embedded for e.g. archival purposes.	2024-06-14 16:56:54 +02:00
Riteo	339828d777	[pp/FFmpegMetadata] Use metadata stream specifier for info.json The old stream index specifiers would indiscriminately select any JSON attachment, which made stuff like embedding live chat json data risky if not impossible. Also adds `-copy_unknown` as JSON data is "unknown" according to FFmpeg (since it has no codec id) and thus would otherwise be rejected by default.	2024-06-14 16:56:52 +02:00