Merge 172bfb74b0 into a9f85670d0

2024-11-26 01:01:25 +01:00 · 2024-11-12 21:56:28 +08:00
11 changed files with 30 additions and 239 deletions
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
@@ -481,7 +481,7 @@ class TestTraversalHelpers:
            'id': 'name',
            'data': 'content',
            'url': 'url',
-        }, all, {subs_list_to_dict(lang=None)}]) == {
+        }, all, {subs_list_to_dict}]) == {
            'de': [{'url': 'https://example.com/subs/de.ass'}],
            'en': [{'data': 'content'}],
        }, 'subs with mandatory items missing should be filtered'
@@ -507,54 +507,6 @@ class TestTraversalHelpers:
            {'url': 'https://example.com/subs/en1', 'ext': 'ext'},
            {'url': 'https://example.com/subs/en2', 'ext': 'ext'},
        ]}, '`quality` key should sort subtitle list accordingly'
-        assert traverse_obj([
-            {'name': 'de', 'url': 'https://example.com/subs/de.ass'},
-            {'name': 'de'},
-            {'name': 'en', 'content': 'content'},
-            {'url': 'https://example.com/subs/en'},
-        ], [..., {
-            'id': 'name',
-            'url': 'url',
-            'data': 'content',
-        }, all, {subs_list_to_dict(lang='en')}]) == {
-            'de': [{'url': 'https://example.com/subs/de.ass'}],
-            'en': [
-                {'data': 'content'},
-                {'url': 'https://example.com/subs/en'},
-            ],
-        }, 'optionally provided lang should be used if no id available'
-        assert traverse_obj([
-            {'name': 1, 'url': 'https://example.com/subs/de1'},
-            {'name': {}, 'url': 'https://example.com/subs/de2'},
-            {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
-            {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
-        ], [..., {
-            'id': 'name',
-            'url': 'url',
-            'ext': 'ext',
-        }, all, {subs_list_to_dict(lang=None)}]) == {
-            'de': [
-                {'url': 'https://example.com/subs/de3'},
-                {'url': 'https://example.com/subs/de4'},
-            ],
-        }, 'non str types should be ignored for id and ext'
-        assert traverse_obj([
-            {'name': 1, 'url': 'https://example.com/subs/de1'},
-            {'name': {}, 'url': 'https://example.com/subs/de2'},
-            {'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
-            {'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
-        ], [..., {
-            'id': 'name',
-            'url': 'url',
-            'ext': 'ext',
-        }, all, {subs_list_to_dict(lang='de')}]) == {
-            'de': [
-                {'url': 'https://example.com/subs/de1'},
-                {'url': 'https://example.com/subs/de2'},
-                {'url': 'https://example.com/subs/de3'},
-                {'url': 'https://example.com/subs/de4'},
-            ],
-        }, 'non str types should be replaced by default id'

    def test_trim_str(self):
        with pytest.raises(TypeError):
@@ -573,7 +525,7 @@ class TestTraversalHelpers:
    def test_unpack(self):
        assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123'
        assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3'
-        assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3'
+        assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3'
        with pytest.raises(TypeError):
            unpack(join_nonempty)()
        with pytest.raises(TypeError):
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -72,6 +72,7 @@ from yt_dlp.utils import (
    intlist_to_bytes,
    iri_to_uri,
    is_html,
+    join_nonempty,
    js_to_json,
    limit_length,
    locked_file,
@@ -2157,6 +2158,10 @@ Line 1
        assert int_or_none(v=10) == 10, 'keyword passed positional should call function'
        assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function'

+        assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially'
+        assert callable(join_nonempty()), 'varargs positional should apply partially'
+        assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function'
+

 if __name__ == '__main__':
    unittest.main()
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -4381,9 +4381,7 @@ class YoutubeDL:
            return None

        for idx, t in list(enumerate(thumbnails))[::-1]:
-            thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
-            if multiple:
-                thumb_ext = f'{t["id"]}.{thumb_ext}'
+            thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
            thumb_display_id = f'{label} thumbnail {t["id"]}'
            thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
            thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1650,7 +1650,6 @@ from .radiokapital import (
    RadioKapitalIE,
    RadioKapitalShowIE,
 )
-from .radioradicale import RadioRadicaleIE
 from .radiozet import RadioZetPodcastIE
 from .radlive import (
    RadLiveChannelIE,
--- a/yt_dlp/extractor/afreecatv.py
+++ b/yt_dlp/extractor/afreecatv.py
@@ -66,14 +66,6 @@ class AfreecaTVBaseIE(InfoExtractor):
            extensions={'legacy_ssl': True}), display_id,
            'Downloading API JSON', 'Unable to download API JSON')

-    @staticmethod
-    def _fixup_thumb(thumb_url):
-        if not url_or_none(thumb_url):
-            return None
-        # Core would determine_ext as 'php' from the url, so we need to provide the real ext
-        # See: https://github.com/yt-dlp/yt-dlp/issues/11537
-        return [{'url': thumb_url, 'ext': 'jpg'}]
-

 class AfreecaTVIE(AfreecaTVBaseIE):
    IE_NAME = 'soop'
@@ -163,7 +155,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
            'uploader': ('writer_nick', {str}),
            'uploader_id': ('bj_id', {str}),
            'duration': ('total_file_duration', {int_or_none(scale=1000)}),
-            'thumbnails': ('thumb', {self._fixup_thumb}),
+            'thumbnail': ('thumb', {url_or_none}),
        })

        entries = []
@@ -234,7 +226,8 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):

        return self.playlist_result(self._entries(data), video_id)

-    def _entries(self, data):
+    @staticmethod
+    def _entries(data):
        # 'files' is always a list with 1 element
        yield from traverse_obj(data, (
            'data', lambda _, v: v['story_type'] == 'catch',
@@ -245,7 +238,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
                'title': ('title', {str}),
                'uploader': ('writer_nick', {str}),
                'uploader_id': ('writer_id', {str}),
-                'thumbnails': ('thumb', {self._fixup_thumb}),
+                'thumbnail': ('thumb', {url_or_none}),
                'timestamp': ('write_timestamp', {int_or_none}),
            }))

--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -205,26 +205,6 @@ class ArchiveOrgIE(InfoExtractor):
                },
            },
        ],
-    }, {
-        # The reviewbody is None for one of the reviews; just need to extract data without crashing
-        'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
-        'info_dict': {
-            'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
-            'ext': 'mp3',
-            'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
-            'creators': ['Grateful Dead'],
-            'duration': 338.31,
-            'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
-            'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
-            'display_id': 'gd95-04-02d1t04.shn',
-            'location': 'Pyramid Arena',
-            'uploader': 'jon@archive.org',
-            'album': '1995-04-02 - Pyramid Arena',
-            'upload_date': '20040519',
-            'track_number': 4,
-            'release_date': '19950402',
-            'timestamp': 1084927901,
-        },
    }]

    @staticmethod
@@ -355,7 +335,7 @@ class ArchiveOrgIE(InfoExtractor):
                info['comments'].append({
                    'id': review.get('review_id'),
                    'author': review.get('reviewer'),
-                    'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
+                    'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
                    'timestamp': unified_timestamp(review.get('createdate')),
                    'parent': 'root'})

--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -279,7 +279,6 @@ class InfoExtractor:
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
-                        * "ext" (optional, string) - actual image extension if not given in URL
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
                return extract_video_data(try_get(
                    js_data, lambda x: x['jsmods']['instances'], list) or [])

-        def extract_dash_manifest(vid_data, formats, mpd_url=None):
+        def extract_dash_manifest(video, formats):
            dash_manifest = traverse_obj(
-                vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
+                video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
            if dash_manifest:
                formats.extend(self._parse_mpd_formats(
                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
-                    mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
+                    mpd_url=url_or_none(video.get('dash_manifest_url'))))

        def process_formats(info):
            # Downloads with browser's User-Agent are rate limited. Working around
@@ -619,12 +619,9 @@ class FacebookIE(InfoExtractor):
                        video = video['creation_story']
                        video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
                        video.update(reel_info)
-
+                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
                    formats = []
                    q = qualities(['sd', 'hd'])
-
-                    # Legacy formats extraction
-                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
                    for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
                                           ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
                                           ('browser_native_sd_url', 'sd')):
@@ -632,7 +629,7 @@ class FacebookIE(InfoExtractor):
                        if not playable_url:
                            continue
                        if determine_ext(playable_url) == 'mpd':
-                            formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
+                            formats.extend(self._extract_mpd_formats(playable_url, video_id))
                        else:
                            formats.append({
                                'format_id': format_id,
@@ -641,28 +638,6 @@ class FacebookIE(InfoExtractor):
                                'url': playable_url,
                            })
                    extract_dash_manifest(fmt_data, formats)
-
-                    # New videoDeliveryResponse formats extraction
-                    fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
-                    mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
-                    dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
-                    for idx, dash_manifest in enumerate(dash_manifests):
-                        extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
-                    if not dash_manifests:
-                        # Only extract from MPD URLs if the manifests are not already provided
-                        for mpd_url in mpd_urls:
-                            formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
-                    for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
-                        format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
-                        formats.append({
-                            'format_id': format_id,
-                            # sd, hd formats w/o resolution info should be deprioritized below DASH
-                            'quality': q(format_id) - 3,
-                            'url': prog_fmt['progressive_url'],
-                        })
-                    for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
-                        formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
-
                    if not formats:
                        # Do not append false positive entry w/o any formats
                        return
--- a/yt_dlp/extractor/radioradicale.py
+++ b/yt_dlp/extractor/radioradicale.py
@@ -1,105 +0,0 @@
-from .common import InfoExtractor
-from ..utils import url_or_none
-from ..utils.traversal import traverse_obj
-
-
-class RadioRadicaleIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?radioradicale\.it/scheda/(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'https://www.radioradicale.it/scheda/471591',
-        'md5': 'eb0fbe43a601f1a361cbd00f3c45af4a',
-        'info_dict': {
-            'id': '471591',
-            'ext': 'mp4',
-            'title': 'md5:e8fbb8de57011a3255db0beca69af73d',
-            'description': 'md5:5e15a789a2fe4d67da8d1366996e89ef',
-            'location': 'Napoli',
-            'duration': 2852.0,
-            'timestamp': 1459987200,
-            'upload_date': '20160407',
-            'thumbnail': 'https://www.radioradicale.it/photo400/0/0/9/0/1/00901768.jpg',
-        },
-    }, {
-        'url': 'https://www.radioradicale.it/scheda/742783/parlamento-riunito-in-seduta-comune-11a-della-xix-legislatura',
-        'info_dict': {
-            'id': '742783',
-            'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
-            'description': '-) Votazione per l\'elezione di un giudice della Corte Costituzionale (nono scrutinio)',
-            'location': 'CAMERA',
-            'duration': 5868.0,
-            'timestamp': 1730246400,
-            'upload_date': '20241030',
-        },
-        'playlist': [{
-            'md5': 'aa48de55dcc45478e4cd200f299aab7d',
-            'info_dict': {
-                'id': '742783-0',
-                'ext': 'mp4',
-                'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
-            },
-        }, {
-            'md5': 'be915c189c70ad2920e5810f32260ff5',
-            'info_dict': {
-                'id': '742783-1',
-                'ext': 'mp4',
-                'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
-            },
-        }, {
-            'md5': 'f0ee4047342baf8ed3128a8417ac5e0a',
-            'info_dict': {
-                'id': '742783-2',
-                'ext': 'mp4',
-                'title': 'Parlamento riunito in seduta comune (11ª della XIX legislatura)',
-            },
-        }],
-    }]
-
-    def _entries(self, videos_info, page_id):
-        for idx, video in enumerate(traverse_obj(
-                videos_info, ('playlist', lambda _, v: v['sources']))):
-            video_id = f'{page_id}-{idx}'
-            formats = []
-            subtitles = {}
-
-            for m3u8_url in traverse_obj(video, ('sources', ..., 'src', {url_or_none})):
-                fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
-                formats.extend(fmts)
-                self._merge_subtitles(subs, target=subtitles)
-            for sub in traverse_obj(video, ('subtitles', ..., lambda _, v: url_or_none(v['src']))):
-                self._merge_subtitles({sub.get('srclang') or 'und': [{
-                    'url': sub['src'],
-                    'name': sub.get('label'),
-                }]}, target=subtitles)
-
-            yield {
-                'id': video_id,
-                'title': video.get('title'),
-                'formats': formats,
-                'subtitles': subtitles,
-            }
-
-    def _real_extract(self, url):
-        page_id = self._match_id(url)
-        webpage = self._download_webpage(url, page_id)
-
-        videos_info = self._search_json(
-            r'jQuery\.extend\(Drupal\.settings\s*,',
-            webpage, 'videos_info', page_id)['RRscheda']
-
-        entries = list(self._entries(videos_info, page_id))
-
-        common_info = {
-            'id': page_id,
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
-            'location': videos_info.get('luogo'),
-            **self._search_json_ld(webpage, page_id),
-        }
-
-        if len(entries) == 1:
-            return {
-                **entries[0],
-                **common_info,
-            }
-
-        return self.playlist_result(entries, multi_video=True, **common_info)
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -216,7 +216,7 @@ def partial_application(func):
    sig = inspect.signature(func)
    required_args = [
        param.name for param in sig.parameters.values()
-        if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
+        if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
        if param.default is inspect.Parameter.empty
    ]

@@ -4837,6 +4837,7 @@ def number_of_digits(number):
    return len('%d' % number)


+@partial_application
 def join_nonempty(*values, delim='-', from_dict=None):
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
--- a/yt_dlp/utils/traversal.py
+++ b/yt_dlp/utils/traversal.py
@@ -332,14 +332,14 @@ class _RequiredError(ExtractorError):


 @typing.overload
-def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
+def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...


 @typing.overload
-def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ...
+def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ...


-def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None):
+def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
    """
    Convert subtitles from a traversal into a subtitle dict.
    The path should have an `all` immediately before this function.
@@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None
    `quality`  The sort order for each subtitle
    """
    if subs is None:
-        return functools.partial(subs_list_to_dict, lang=lang, ext=ext)
+        return functools.partial(subs_list_to_dict, ext=ext)

    result = collections.defaultdict(list)

@@ -360,16 +360,10 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None
        if not url_or_none(sub.get('url')) and not sub.get('data'):
            continue
        sub_id = sub.pop('id', None)
-        if not isinstance(sub_id, str):
-            if not lang:
-                continue
-            sub_id = lang
-        sub_ext = sub.get('ext')
-        if not isinstance(sub_ext, str):
-            if not ext:
-                sub.pop('ext', None)
-            else:
-                sub['ext'] = ext
+        if sub_id is None:
+            continue
+        if ext is not None and not sub.get('ext'):
+            sub['ext'] = ext
        result[sub_id].append(sub)
    result = dict(result)

@ -458,9 +452,9 @@ def trim_str(*, start=None, end=None):
    return trim


-def unpack(func, **kwargs):
+def unpack(func):
    @functools.wraps(func)
-    def inner(items):
+    def inner(items, **kwargs):
        return func(*items, **kwargs)

    return inner