Compare commits

...

15 Commits

Author SHA1 Message Date
Mozi
6fb342c163
Merge 8779a8897c into c699bafc50 2024-11-16 07:40:24 +00:00
Mozi
8779a8897c simplify statements in traversal 2024-11-16 07:39:59 +00:00
Mozi
88de6d0c2d merge 'master' 2024-11-16 07:22:46 +00:00
bashonly
c699bafc50 [ie/soop] Fix thumbnail extraction (#11545)
Closes #11537

Authored by: bashonly
2024-11-15 22:51:55 +00:00
bashonly
eb64ae7d5d [ie] Allow ext override for thumbnails (#11545)
Authored by: bashonly
2024-11-15 22:51:55 +00:00
Simon Sawicki
c014fbcddc
[utils] subs_list_to_dict: Add lang default parameter (#11508)
Authored by: Grub4K
2024-11-15 23:25:52 +01:00
Simon Sawicki
39d79c9b9c
[utils] Fix join_nonempty, add **kwargs to unpack (#11559)
Authored by: Grub4K
2024-11-15 22:06:15 +01:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
Mozi
628ce197eb merge 'master' 2024-10-28 00:37:58 +00:00
Mozi
c0aa2e8160 fix usage of 'self._merge_subtitles' 2024-10-28 00:37:42 +00:00
Mozi
4b00360b4e [ie/vidio:live] the code I wrote does not seem to work. let's rewrite it
Those two URLs of Premier-exclusive livestreams are still not working!
2024-09-21 12:28:25 +00:00
Mozi
8155ed770b merge branch 'master' 2024-09-21 10:08:41 +00:00
Mozi
20c66ec13e [ie/vidio] Fix login; use new API; check DRM; extract comments 2024-09-21 10:07:45 +00:00
Mozi
3bb739f188 [ie/vidio:live] Add DASH support; use new API 2024-09-01 16:56:51 +00:00
10 changed files with 510 additions and 159 deletions

View File

@@ -481,7 +481,7 @@ class TestTraversalHelpers:
'id': 'name',
'data': 'content',
'url': 'url',
}, all, {subs_list_to_dict}]) == {
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [{'data': 'content'}],
}, 'subs with mandatory items missing should be filtered'
@@ -507,6 +507,54 @@ class TestTraversalHelpers:
{'url': 'https://example.com/subs/en1', 'ext': 'ext'},
{'url': 'https://example.com/subs/en2', 'ext': 'ext'},
]}, '`quality` key should sort subtitle list accordingly'
assert traverse_obj([
{'name': 'de', 'url': 'https://example.com/subs/de.ass'},
{'name': 'de'},
{'name': 'en', 'content': 'content'},
{'url': 'https://example.com/subs/en'},
], [..., {
'id': 'name',
'url': 'url',
'data': 'content',
}, all, {subs_list_to_dict(lang='en')}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [
{'data': 'content'},
{'url': 'https://example.com/subs/en'},
],
}, 'optionally provided lang should be used if no id available'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be ignored for id and ext'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang='de')}]) == {
'de': [
{'url': 'https://example.com/subs/de1'},
{'url': 'https://example.com/subs/de2'},
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be replaced by default id'
def test_trim_str(self):
with pytest.raises(TypeError):
@@ -525,7 +573,7 @@ class TestTraversalHelpers:
def test_unpack(self):
assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123'
assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3'
assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3'
assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3'
with pytest.raises(TypeError):
unpack(join_nonempty)()
with pytest.raises(TypeError):

View File

@@ -72,7 +72,6 @@ from yt_dlp.utils import (
intlist_to_bytes,
iri_to_uri,
is_html,
join_nonempty,
js_to_json,
limit_length,
locked_file,
@@ -2158,10 +2157,6 @@ Line 1
assert int_or_none(v=10) == 10, 'keyword passed positional should call function'
assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function'
assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially'
assert callable(join_nonempty()), 'varargs positional should apply partially'
assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function'
if __name__ == '__main__':
unittest.main()

View File

@@ -4381,7 +4381,9 @@ class YoutubeDL:
return None
for idx, t in list(enumerate(thumbnails))[::-1]:
thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
if multiple:
thumb_ext = f'{t["id"]}.{thumb_ext}'
thumb_display_id = f'{label} thumbnail {t["id"]}'
thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))

View File

@@ -66,6 +66,14 @@ class AfreecaTVBaseIE(InfoExtractor):
extensions={'legacy_ssl': True}), display_id,
'Downloading API JSON', 'Unable to download API JSON')
@staticmethod
def _fixup_thumb(thumb_url):
if not url_or_none(thumb_url):
return None
# Core would determine_ext as 'php' from the url, so we need to provide the real ext
# See: https://github.com/yt-dlp/yt-dlp/issues/11537
return [{'url': thumb_url, 'ext': 'jpg'}]
class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'soop'
@@ -155,7 +163,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {int_or_none(scale=1000)}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
})
entries = []
@@ -226,8 +234,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
return self.playlist_result(self._entries(data), video_id)
@staticmethod
def _entries(data):
def _entries(self, data):
# 'files' is always a list with 1 element
yield from traverse_obj(data, (
'data', lambda _, v: v['story_type'] == 'catch',
@@ -238,7 +245,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('writer_id', {str}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
'timestamp': ('write_timestamp', {int_or_none}),
}))

View File

@@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
},
},
],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}]
@staticmethod
@@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({
'id': review.get('review_id'),
'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'})

View File

@@ -279,6 +279,7 @@ class InfoExtractor:
thumbnails: A list of dictionaries, with the following entries:
* "id" (optional, string) - Thumbnail format ID
* "url"
* "ext" (optional, string) - actual image extension if not given in URL
* "preference" (optional, int) - quality of the image
* "width" (optional, int)
* "height" (optional, int)

View File

@@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats):
def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest:
formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=url_or_none(video.get('dash_manifest_url'))))
mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around
@@ -619,9 +619,12 @@
video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = []
q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')):
@@ -629,7 +632,7 @@
if not playable_url:
continue
if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id))
formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else:
formats.append({
'format_id': format_id,
@@ -638,6 +641,28 @@
'url': playable_url,
})
extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats:
# Do not append false positive entry w/o any formats
return

View File

@@ -1,18 +1,24 @@
import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
format_field,
extract_attributes,
get_element_by_class,
get_element_html_by_id,
int_or_none,
parse_iso8601,
remove_end,
smuggle_url,
str_or_none,
strip_or_none,
str_to_int,
try_get,
unsmuggle_url,
url_or_none,
urlencode_postdata,
)
from ..utils.traversal import traverse_obj
class VidioBaseIE(InfoExtractor):
@@ -35,6 +41,7 @@ class VidioBaseIE(InfoExtractor):
login_form.update({
'user[login]': username,
'user[password]': password,
'authenticity_token': self._html_search_meta('csrf-token', login_page, fatal=True),
})
login_post, login_post_urlh = self._download_webpage_handle(
self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), expected_status=[302, 401])
@@ -58,6 +65,7 @@
def _initialize_pre_login(self):
self._api_key = self._download_json(
'https://www.vidio.com/auth', None, data=b'')['api_key']
self._ua = self.get_param('http_headers')['User-Agent']
def _call_api(self, url, video_id, note=None):
return self._download_json(url, video_id, note=note, headers={
@@ -67,7 +75,9 @@
class VidioIE(VidioBaseIE):
_GEO_COUNTRIES = ['ID']
_VALID_URL = r'https?://(?:www\.)?vidio\.com/(watch|embed)/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
_EMBED_REGEX = [rf'(?x)<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015',
'md5': 'abac81b1a205a8d94c609a473b5ea62a',
@@ -77,113 +87,317 @@ class VidioIE(VidioBaseIE):
'ext': 'mp4',
'title': 'DJ_AMBRED - Booyah (Live 2015)',
'description': 'md5:27dc15f819b6a78a626490881adbadf8',
'thumbnail': r're:^https?://.*\.jpg$',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 149,
'like_count': int,
'uploader': 'TWELVE Pic',
'timestamp': 1444902800,
'uploader': 'twelvepictures',
'timestamp': 1444902960,
'upload_date': '20151015',
'uploader_id': 'twelvepictures',
'channel': 'Cover Music Video',
'uploader_id': '270115',
'channel': 'cover-music-video',
'channel_id': '280236',
'view_count': int,
'dislike_count': int,
'comment_count': int,
'channel_url': 'https://www.vidio.com/@twelvepictures/channels/280236-cover-music-video',
'tags': 'count:3',
'uploader_url': 'https://www.vidio.com/@twelvepictures',
'live_status': 'not_live',
'genres': ['vlog', 'comedy', 'edm'],
'season_id': '',
'season_name': '',
'age_limit': 13,
'comment_count': int,
},
'params': {
'getcomments': True,
},
}, {
# DRM protected
'url': 'https://www.vidio.com/watch/7095853-ep-04-sketch-book',
'md5': 'abac81b1a205a8d94c609a473b5ea62a',
'info_dict': {
'id': '7095853',
'display_id': 'ep-04-sketch-book',
'ext': 'mp4',
'title': 'Ep 04 - Sketch Book',
'description': 'md5:9e22b4b1dbd65209c143d7009e899830',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 2784,
'uploader': 'vidiooriginal',
'timestamp': 1658509200,
'upload_date': '20220722',
'uploader_id': '31052580',
'channel': 'cupcake-untuk-rain',
'channel_id': '52332655',
'channel_url': 'https://www.vidio.com/@vidiooriginal/channels/52332655-cupcake-untuk-rain',
'tags': [],
'uploader_url': 'https://www.vidio.com/@vidiooriginal',
'live_status': 'not_live',
'genres': ['romance', 'drama', 'comedy', 'Teen', 'love triangle'],
'season_id': '8220',
'season_name': 'Season 1',
'age_limit': 13,
'availability': 'premium_only',
'comment_count': int,
},
'expected_warnings': ['This video is DRM protected'],
'params': {
'getcomments': True,
'skip_download': True,
'ignore_no_formats_error': True,
},
}, {
'url': 'https://www.vidio.com/watch/7439193-episode-1-magic-5',
'md5': 'b1644c574aeb20c91503be367ac2d211',
'info_dict': {
'id': '7439193',
'display_id': 'episode-1-magic-5',
'ext': 'mp4',
'title': 'Episode 1 - Magic 5',
'description': 'md5:367255f9e8e7ad7192c26218f01b6260',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 6126,
'uploader': 'indosiar',
'timestamp': 1679315400,
'upload_date': '20230320',
'uploader_id': '12',
'channel': 'magic-5',
'channel_id': '52350795',
'channel_url': 'https://www.vidio.com/@indosiar/channels/52350795-magic-5',
'tags': ['basmalah', 'raden-rakha', 'eby-da-5', 'sinetron', 'afan-da-5', 'sridevi-da5'],
'uploader_url': 'https://www.vidio.com/@indosiar',
'live_status': 'not_live',
'genres': ['drama', 'fantasy', 'friendship'],
'season_id': '11017',
'season_name': 'Episode',
'age_limit': 13,
},
}, {
'url': 'https://www.vidio.com/watch/1716926-mas-suka-masukin-aja',
'md5': 'acc4009eeac0033328419aada7bc6925',
'info_dict': {
'id': '1716926',
'display_id': 'mas-suka-masukin-aja',
'ext': 'mp4',
'title': 'Mas Suka, Masukin Aja',
'description': 'md5:667093b08e07b6fb92f68037f81f2267',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 5080,
'uploader': 'vidiopremier',
'timestamp': 1564735560,
'upload_date': '20190802',
'uploader_id': '26094842',
'channel': 'mas-suka-masukin-aja',
'channel_id': '34112289',
'channel_url': 'https://www.vidio.com/@vidiopremier/channels/34112289-mas-suka-masukin-aja',
'tags': [],
'uploader_url': 'https://www.vidio.com/@vidiopremier',
'live_status': 'not_live',
'genres': ['comedy', 'romance'],
'season_id': '663',
'season_name': '',
'age_limit': 18,
'availability': 'premium_only',
},
'params': {
'ignore_no_formats_error': True,
},
'expected_warnings': ['This show isn\'t available in your country'],
}, {
'url': 'https://www.vidio.com/watch/2372948-first-day-of-school-kindergarten-life-song-beabeo-nursery-rhymes-kids-songs',
'md5': 'c6d1bde08eee88bea27cca9dc38bc3df',
'info_dict': {
'id': '2372948',
'display_id': 'first-day-of-school-kindergarten-life-song-beabeo-nursery-rhymes-kids-songs',
'ext': 'mp4',
'title': 'First Day of School | Kindergarten Life Song | BeaBeo Nursery Rhymes & Kids Songs',
'description': 'md5:d505486a67415903f7f3ab61adfd5a91',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 517,
'uploader': 'kidsstartv',
'timestamp': 1638518400,
'upload_date': '20211203',
'uploader_id': '38247189',
'channel': 'beabeo-school-series',
'channel_id': '52311987',
'channel_url': 'https://www.vidio.com/@kidsstartv/channels/52311987-beabeo-school-series',
'tags': [],
'uploader_url': 'https://www.vidio.com/@kidsstartv',
'live_status': 'not_live',
'genres': ['animation', 'Cartoon'],
'season_id': '6023',
'season_name': 'school series',
},
}, {
'url': 'https://www.vidio.com/watch/1550718-stand-by-me-doraemon',
'md5': '405b61a2f06c74e052e0bd67cad6b891',
'info_dict': {
'id': '1550718',
'display_id': 'stand-by-me-doraemon',
'ext': 'mp4',
'title': 'Stand by Me Doraemon',
'description': 'md5:673d899f6a58dd4b0d18aebe30545e2a',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 5429,
'uploader': 'vidiopremier',
'timestamp': 1545815634,
'upload_date': '20181226',
'uploader_id': '26094842',
'channel': 'stand-by-me-doraemon',
'channel_id': '29750953',
'channel_url': 'https://www.vidio.com/@vidiopremier/channels/29750953-stand-by-me-doraemon',
'tags': ['anime-lucu', 'top-10-this-week', 'kids', 'stand-by-me-doraemon-2'],
'uploader_url': 'https://www.vidio.com/@vidiopremier',
'live_status': 'not_live',
'genres': ['anime', 'family', 'adventure', 'comedy', 'coming of age'],
'season_id': '237',
'season_name': '',
'age_limit': 7,
'availability': 'premium_only',
},
'params': {
'ignore_no_formats_error': True,
},
'expected_warnings': ['This show isn\'t available in your country'],
}, {
# 404 Not Found
'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
'only_matching': True,
}, {
# Premier-exclusive video
'url': 'https://www.vidio.com/watch/1550718-stand-by-me-doraemon',
'only_matching': True,
}, {
# embed url from https://enamplus.liputan6.com/read/5033648/video-fakta-temuan-suspek-cacar-monyet-di-jawa-tengah
'url': 'https://www.vidio.com/embed/7115874-fakta-temuan-suspek-cacar-monyet-di-jawa-tengah',
}]
_WEBPAGE_TESTS = [{
# embed player: https://www.vidio.com/embed/7115874-fakta-temuan-suspek-cacar-monyet-di-jawa-tengah
'url': 'https://enamplus.liputan6.com/read/5033648/video-fakta-temuan-suspek-cacar-monyet-di-jawa-tengah',
'info_dict': {
'id': '7115874',
'ext': 'mp4',
'channel_id': '40172876',
'comment_count': int,
'uploader_id': 'liputan6',
'view_count': int,
'dislike_count': int,
'upload_date': '20220804',
'uploader': 'Liputan6.com',
'display_id': 'fakta-temuan-suspek-cacar-monyet-di-jawa-tengah',
'channel': 'ENAM PLUS 165',
'timestamp': 1659605520,
'ext': 'mp4',
'title': 'Fakta Temuan Suspek Cacar Monyet di Jawa Tengah',
'duration': 59,
'like_count': int,
'tags': ['monkeypox indonesia', 'cacar monyet menyebar', 'suspek cacar monyet di indonesia', 'fakta', 'hoax atau bukan?', 'jawa tengah'],
'thumbnail': 'https://thumbor.prod.vidiocdn.com/83PN-_BKm5sS7emLtRxl506MLqQ=/640x360/filters:quality(70)/vidio-web-prod-video/uploads/video/image/7115874/fakta-suspek-cacar-monyet-di-jawa-tengah-24555a.jpg',
'uploader_url': 'https://www.vidio.com/@liputan6',
'description': 'md5:6d595a18d3b19ee378e335a6f288d5ac',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'duration': 59,
'uploader': 'liputan6',
'timestamp': 1659605693,
'upload_date': '20220804',
'uploader_id': '139',
'channel': 'enam-plus-165',
'channel_id': '40172876',
'channel_url': 'https://www.vidio.com/@liputan6/channels/40172876-enam-plus-165',
'tags': ['monkeypox-indonesia', 'cacar-monyet-menyebar', 'suspek-cacar-monyet-di-indonesia', 'fakta', 'hoax-atau-bukan', 'jawa-tengah'],
'uploader_url': 'https://www.vidio.com/@liputan6',
'live_status': 'not_live',
'genres': ['health'],
'season_id': '',
'season_name': '',
'age_limit': 13,
'comment_count': int,
},
'params': {
'getcomments': True,
},
}]
def _real_extract(self, url):
match = self._match_valid_url(url).groupdict()
video_id, display_id = match.get('id'), match.get('display_id')
data = self._call_api('https://api.vidio.com/videos/' + video_id, display_id)
video = data['videos'][0]
title = video['title'].strip()
is_premium = video.get('is_premium')
video_id, display_id = self._match_valid_url(url).groups()
if is_premium:
sources = self._download_json(
f'https://www.vidio.com/interactions_stream.json?video_id={video_id}&type=videos',
display_id, note='Downloading premier API JSON')
if not (sources.get('source') or sources.get('source_dash')):
self.raise_login_required('This video is only available for registered users with the appropriate subscription')
webpage = self._download_webpage(url, video_id)
api_data = self._call_api(f'https://api.vidio.com/videos/{video_id}', display_id, 'Downloading API data')
interactions_stream = self._download_json(
'https://www.vidio.com/interactions_stream.json', video_id,
query={'video_id': video_id, 'type': 'videos'}, note='Downloading stream info',
errnote='Unable to download stream info')
formats, subs = [], {}
if sources.get('source'):
hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
sources['source'], display_id, 'mp4', 'm3u8_native')
formats.extend(hls_formats)
subs.update(hls_subs)
if sources.get('source_dash'): # TODO: Find video example with source_dash
dash_formats, dash_subs = self._extract_mpd_formats_and_subtitles(
sources['source_dash'], display_id, 'dash')
formats.extend(dash_formats)
subs.update(dash_subs)
else:
hls_url = data['clips'][0]['hls_url']
formats, subs = self._extract_m3u8_formats_and_subtitles(
hls_url, display_id, 'mp4', 'm3u8_native')
attrs = extract_attributes(get_element_html_by_id(f'player-data-{video_id}', webpage))
get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {}
channel = get_first('channel')
user = get_first('user')
username = user.get('username')
get_count = lambda x: int_or_none(video.get('total_' + x))
if traverse_obj(attrs, ('data-drm-enabled', {lambda x: x == 'true'})):
self.report_drm(video_id)
if traverse_obj(attrs, ('data-geoblock', {lambda x: x == 'true'})):
self.raise_geo_restricted(
'This show isn\'t available in your country', countries=['ID'], metadata_available=True)
subtitles = dict(traverse_obj(attrs, ('data-subtitles', {json.loads}, ..., {
lambda x: (x['language'], [{'url': x['file']['url']}]),
})))
formats = []
# There are time-based strings in the playlist URL,
# so try the other URL iff no formats extracted from the prior one.
for m3u8_url in traverse_obj([
interactions_stream.get('source'),
attrs.get('data-vjs-clip-hls-url')], (..., {url_or_none})):
fmt, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, ext='mp4', m3u8_id='hls')
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
if fmt:
break
for mpd_url in traverse_obj([
interactions_stream.get('source_dash'),
attrs.get('data-vjs-clip-dash-url')], (..., {url_or_none})):
fmt, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, mpd_id='dash')
formats.extend(fmt)
self._merge_subtitles(subs, target=subtitles)
if fmt:
break
# TODO: extract also short previews of premier-exclusive videos from "attrs['data-content-preview-url']".
uploader = attrs.get('data-video-username')
uploader_url = f'https://www.vidio.com/@{uploader}'
channel = attrs.get('data-video-channel')
channel_id = attrs.get('data-video-channel-id')
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': strip_or_none(video.get('description')),
'thumbnail': video.get('image_url_medium'),
'duration': int_or_none(video.get('duration')),
'like_count': get_count('likes'),
'title': (traverse_obj(api_data, ('videos', 0, 'title'))
or attrs.get('data-video-title')
or self._html_extract_title(webpage)),
'live_status': 'not_live',
'formats': formats,
'subtitles': subs,
'uploader': user.get('name'),
'timestamp': parse_iso8601(video.get('created_at')),
'uploader_id': username,
'uploader_url': format_field(username, None, 'https://www.vidio.com/@%s'),
'channel': channel.get('name'),
'channel_id': str_or_none(channel.get('id')),
'view_count': get_count('view_count'),
'dislike_count': get_count('dislikes'),
'comment_count': get_count('comments'),
'tags': video.get('tag_list'),
'subtitles': subtitles,
'channel': channel,
'channel_id': channel_id,
'channel_url': f'{uploader_url}/channels/{channel_id}-{channel}',
'genres': traverse_obj(attrs, ('data-genres', {str_or_none}, {str.split(sep=',')}), default=[]),
'season_id': traverse_obj(attrs, ('data-season-id', {str_or_none})),
'season_name': traverse_obj(attrs, ('data-season-name', {str})),
'uploader': uploader,
'uploader_id': traverse_obj(attrs, ('data-video-user-id', {str_or_none})),
'uploader_url': uploader_url,
'thumbnail': traverse_obj(attrs, ('data-video-image-url', {url_or_none})),
'duration': traverse_obj(attrs, ('data-video-duration', {str_to_int})),
'description': traverse_obj(attrs, ('data-video-description', {str})),
'availability': self._availability(needs_premium=(attrs.get('data-access-type') == 'premium')),
'tags': traverse_obj(attrs, ('data-video-tags', {str_or_none}, {str.split(sep=',')}), default=[]),
'timestamp': traverse_obj(attrs, ('data-video-publish-date', {parse_iso8601(delimiter=' ')})),
'age_limit': (traverse_obj(attrs, ('data-adult', {lambda x: 18 if x == 'true' else 0}))
or traverse_obj(attrs, ('data-content-rating-option', {remove_end(end=' or more')}, {str_to_int}))),
'__post_extractor': self.extract_comments(video_id),
}
def _get_comments(self, video_id):
# TODO: extract replies under comments
def extract_comments(comments_data):
users = dict(traverse_obj(comments_data, ('included', ..., {
lambda x: (x['id'], {
'author': x['attributes']['username'],
'author_thumbnail': url_or_none(x['attributes']['avatar_url_big'] or x['attributes']['avatar_url_small']),
'author_url': url_or_none(x['links']['self']),
}),
})))
yield from traverse_obj(comments_data, ('data', ..., {
'id': 'id',
'text': ('attributes', 'content'),
'timestamp': ('attributes', 'created_at', {parse_iso8601}),
'like_count': ('attributes', 'likes'),
'author_id': ('attributes', 'user_id'),
}, {lambda x: {**x, **users.get(x['author_id'])}}))
comment_page_url = f'https://api.vidio.com/videos/{video_id}/comments'
while comment_page_url:
comments_data = self._call_api(comment_page_url, video_id, 'Downloading comments')
comment_page_url = traverse_obj(comments_data, ('links', 'next', {url_or_none}))
yield from extract_comments(comments_data)
class VidioPremierIE(VidioBaseIE):
_VALID_URL = r'https?://(?:www\.)?vidio\.com/premier/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
@@ -234,10 +448,43 @@ class VidioLiveIE(VidioBaseIE):
'url': 'https://www.vidio.com/live/204-sctv',
'info_dict': {
'id': '204',
'title': 'SCTV',
'uploader': 'SCTV',
'uploader_id': 'sctv',
'thumbnail': r're:^https?://.*\.jpg$',
'ext': 'mp4',
'title': r're:SCTV \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'display_id': 'sctv',
'uploader': 'sctv',
'uploader_id': '4',
'uploader_url': 'https://www.vidio.com/@sctv',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'live_status': 'is_live',
'description': r're:^SCTV merupakan stasiun televisi nasional terkemuka di Indonesia.+',
'like_count': int,
'dislike_count': int,
'timestamp': 1461258000,
'upload_date': '20160421',
'tags': [],
'genres': [],
'age_limit': 13,
},
}, {
'url': 'https://vidio.com/live/733-trans-tv',
'info_dict': {
'id': '733',
'ext': 'mp4',
'title': r're:TRANS TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
'display_id': 'trans-tv',
'uploader': 'transtv',
'uploader_id': '551300',
'uploader_url': 'https://www.vidio.com/@transtv',
'thumbnail': r're:^https?://thumbor\.prod\.vidiocdn\.com/.+\.jpg$',
'live_status': 'is_live',
'description': r're:^Trans TV adalah stasiun televisi swasta Indonesia.+',
'like_count': int,
'dislike_count': int,
'timestamp': 1461355080,
'upload_date': '20160422',
'tags': [],
'genres': [],
'age_limit': 13,
},
}, {
# Premier-exclusive livestream
@@ -251,59 +498,60 @@
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).groups()
stream_data = self._call_api(
f'https://www.vidio.com/api/livestreamings/{video_id}/detail', display_id)
stream_meta = stream_data['livestreamings'][0]
user = stream_data.get('users', [{}])[0]
title = stream_meta.get('title')
username = user.get('username')
webpage = self._download_webpage(url, video_id)
stream_meta = traverse_obj(self._call_api(
f'https://www.vidio.com/api/livestreamings/{video_id}/detail', video_id),
('livestreamings', 0, {dict}), default={})
tokenized_playlist_urls = self._download_json(
f'https://www.vidio.com/live/{video_id}/tokens', video_id,
query={'type': 'dash'}, note='Downloading tokenized playlist',
errnote='Unable to download tokenized playlist', data=b'')
interactions_stream = self._download_json(
'https://www.vidio.com/interactions_stream.json', video_id,
query={'video_id': video_id, 'type': 'videos'}, note='Downloading stream info',
errnote='Unable to download stream info')
attrs = extract_attributes(get_element_html_by_id(f'player-data-{video_id}', webpage))
if traverse_obj(attrs, ('data-drm-enabled', {lambda x: x == 'true'})):
self.report_drm(video_id)
if traverse_obj(attrs, ('data-geoblock', {lambda x: x == 'true'})):
self.raise_geo_restricted(
'This show isn\'t available in your country', countries=['ID'], metadata_available=True)
formats = []
if stream_meta.get('is_drm'):
if not self.get_param('allow_unplayable_formats'):
self.report_drm(video_id)
if stream_meta.get('is_premium'):
sources = self._download_json(
f'https://www.vidio.com/interactions_stream.json?video_id={video_id}&type=livestreamings',
display_id, note='Downloading premier API JSON')
if not (sources.get('source') or sources.get('source_dash')):
self.raise_login_required('This video is only available for registered users with the appropriate subscription')
if str_or_none(sources.get('source')):
token_json = self._download_json(
f'https://www.vidio.com/live/{video_id}/tokens',
display_id, note='Downloading HLS token JSON', data=b'')
formats.extend(self._extract_m3u8_formats(
sources['source'] + '?' + token_json.get('token', ''), display_id, 'mp4', 'm3u8_native'))
if str_or_none(sources.get('source_dash')):
pass
else:
if stream_meta.get('stream_token_url'):
token_json = self._download_json(
f'https://www.vidio.com/live/{video_id}/tokens',
display_id, note='Downloading HLS token JSON', data=b'')
formats.extend(self._extract_m3u8_formats(
stream_meta['stream_token_url'] + '?' + token_json.get('token', ''),
display_id, 'mp4', 'm3u8_native'))
if stream_meta.get('stream_dash_url'):
pass
if stream_meta.get('stream_url'):
formats.extend(self._extract_m3u8_formats(
stream_meta['stream_url'], display_id, 'mp4', 'm3u8_native'))
for m3u8_url in traverse_obj([
tokenized_playlist_urls.get('hls_url'),
interactions_stream.get('source')], (..., {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='hls'))
for mpd_url in traverse_obj([
tokenized_playlist_urls.get('dash_url'),
interactions_stream.get('source_dash')], (..., {url_or_none})):
formats.extend(self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash'))
uploader = attrs.get('data-video-username')
uploader_url = f'https://www.vidio.com/@{uploader}'
return {
'id': video_id,
'display_id': display_id,
'title': title,
'is_live': True,
'description': strip_or_none(stream_meta.get('description')),
'thumbnail': stream_meta.get('image'),
'title': attrs.get('data-video-title'),
'live_status': 'is_live',
'formats': formats,
'genres': traverse_obj(attrs, ('data-genres', {str_or_none}, {str.split(sep=',')}), default=[]),
'uploader': uploader,
'uploader_id': traverse_obj(attrs, ('data-video-user-id', {str_or_none})),
'uploader_url': uploader_url,
'thumbnail': traverse_obj(attrs, ('data-video-image-url', {url_or_none})),
'description': traverse_obj(attrs, ('data-video-description', {str})),
'availability': self._availability(needs_premium=(attrs.get('data-access-type') == 'premium')),
'tags': traverse_obj(attrs, ('data-video-tags', {str_or_none}, {str.split(sep=',')}), default=[]),
'age_limit': (traverse_obj(attrs, ('data-adult', {lambda x: 18 if x == 'true' else 0}))
or traverse_obj(attrs, ('data-content-rating-option', {remove_end(end=' or more')}, {str_to_int}))),
'like_count': int_or_none(stream_meta.get('like')),
'dislike_count': int_or_none(stream_meta.get('dislike')),
'formats': formats,
'uploader': user.get('name'),
'timestamp': parse_iso8601(stream_meta.get('start_time')),
'uploader_id': username,
'uploader_url': format_field(username, None, 'https://www.vidio.com/@%s'),
}

View File

@ -216,7 +216,7 @@ def partial_application(func):
sig = inspect.signature(func)
required_args = [
param.name for param in sig.parameters.values()
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
if param.default is inspect.Parameter.empty
]
@ -4837,7 +4837,6 @@ def number_of_digits(number):
return len('%d' % number)
@partial_application
def join_nonempty(*values, delim='-', from_dict=None):
if from_dict is not None:
values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)

View File

@ -332,14 +332,14 @@ class _RequiredError(ExtractorError):
@typing.overload
def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
@typing.overload
def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None):
"""
Convert subtitles from a traversal into a subtitle dict.
The path should have an `all` immediately before this function.
@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
`quality` The sort order for each subtitle
"""
if subs is None:
return functools.partial(subs_list_to_dict, ext=ext)
return functools.partial(subs_list_to_dict, lang=lang, ext=ext)
result = collections.defaultdict(list)
@ -360,9 +360,15 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
if not url_or_none(sub.get('url')) and not sub.get('data'):
continue
sub_id = sub.pop('id', None)
if sub_id is None:
if not isinstance(sub_id, str):
if not lang:
continue
if ext is not None and not sub.get('ext'):
sub_id = lang
sub_ext = sub.get('ext')
if not isinstance(sub_ext, str):
if not ext:
sub.pop('ext', None)
else:
sub['ext'] = ext
result[sub_id].append(sub)
result = dict(result)
@ -452,9 +458,9 @@ def trim_str(*, start=None, end=None):
return trim
def unpack(func):
def unpack(func, **kwargs):
@functools.wraps(func)
def inner(items, **kwargs):
def inner(items):
return func(*items, **kwargs)
return inner