2024-11-27 09:41:26 +01:00
7 changed files with 72 additions and 127 deletions
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -140,8 +140,6 @@ class TestFormatSelection(unittest.TestCase):
        test('example-with-dashes', 'example-with-dashes')
        test('all', '2', '47', '45', 'example-with-dashes', '35')
        test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
-        # See: https://github.com/yt-dlp/yt-dlp/pulls/8797
-        test('7_a/worst', '35')

    def test_format_selection_audio(self):
        formats = [
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -2465,16 +2465,9 @@ class YoutubeDL:
                return selector_function(ctx_copy)
            return final_selector

-        # HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
-        #       Prefix numbers with random letters to avoid it being classified as a number
-        #       See: https://github.com/yt-dlp/yt-dlp/pulls/8797
-        # TODO: Implement parser not reliant on tokenize.tokenize
-        prefix = ''.join(random.choices(string.ascii_letters, k=32))
-        stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
+        stream = io.BytesIO(format_spec.encode())
        try:
-            tokens = list(_remove_unused_ops(
-                token._replace(string=token.string.replace(prefix, ''))
-                for token in tokenize.tokenize(stream.readline)))
+            tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@ -292,7 +292,7 @@ class ARDIE(InfoExtractor):
    _TESTS = [{
        # available till 7.12.2023
        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
-        'md5': '94812e6438488fb923c361a44469614b',
+        'md5': 'a438f671e87a7eba04000336a119ccc4',
        'info_dict': {
            'id': 'maischberger-video-424',
            'display_id': 'maischberger-video-424',
@ -403,25 +403,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:(?P<client>[^/]+)/)?
-        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
+        (?:player|live|video|(?P<playlist>sendung|sammlung))/
        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''

    _TESTS = [{
-        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
-        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
+        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
+        'md5': '3fd5fead7a370a819341129c8d713136',
        'info_dict': {
-            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
-            'id': '12939099',
-            'title': 'Liebe auf vier Pfoten',
-            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
-            'duration': 5222,
-            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
-            'timestamp': 1701343800,
-            'upload_date': '20231130',
+            'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
+            'id': '12172961',
+            'title': 'Wolfsland - Die traurigen Schwestern',
+            'description': r're:^Als der Polizeiobermeister Raaben',
+            'duration': 5241,
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
+            'timestamp': 1670710500,
+            'upload_date': '20221210',
            'ext': 'mp4',
-            'episode': 'Liebe auf vier Pfoten',
+            'age_limit': 12,
+            'episode': 'Wolfsland - Die traurigen Schwestern',
            'series': 'Filme im MDR'
        },
    }, {
@ -453,7 +454,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
            'duration': 915,
            'episode': 'tagesschau, 20:00 Uhr',
            'series': 'tagesschau',
-            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -474,10 +475,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
        # playlist of type 'sendung'
        'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
        'only_matching': True,
-    }, {
-        # playlist of type 'serie'
-        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
-        'only_matching': True,
    }, {
        # playlist of type 'sammlung'
        'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@ -490,11 +487,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
        'only_matching': True,
    }]

-    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
+    def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
        """ Query the ARD server for playlist information
        and returns the data in "raw" format """
-        assert mode in ('sendung', 'serie', 'sammlung')
-        if mode in ('sendung', 'serie'):
+        if mode == 'sendung':
            graphQL = json.dumps({
                'query': '''{
                    showPage(
@ -511,7 +507,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
                            links { target { id href title } }
                            type
                        }
-                    }}''' % (client, playlist_id, page_number),
+                    }}''' % (client, playlist_id, pageNumber),
            }).encode()
        else:  # mode == 'sammlung'
            graphQL = json.dumps({
@ -532,7 +528,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
                                type
                            }
                        }
-                    }}''' % (client, playlist_id, page_number),
+                    }}''' % (client, playlist_id, pageNumber),
            }).encode()
        # Ressources for ARD graphQL debugging:
        # https://api-test.ardmediathek.de/public-gateway
@ -542,7 +538,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
            data=graphQL,
            headers={'Content-Type': 'application/json'})['data']
        # align the structure of the returned data:
-        if mode in ('sendung', 'serie'):
+        if mode == 'sendung':
            show_page = show_page['showPage']
        else:  # mode == 'sammlung'
            show_page = show_page['morePage']['widget']
@ -550,12 +546,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):

    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
        """ Collects all playlist entries and returns them as info dict.
-        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
-        as well as nested playlists. """
+        Supports playlists of mode 'sendung' and 'sammlung', and also nested
+        playlists. """
        entries = []
        pageNumber = 0
        while True:  # iterate by pageNumber
-            show_page = self._ARD_load_playlist_snippet(
+            show_page = self._ARD_load_playlist_snipped(
                playlist_id, display_id, client, mode, pageNumber)
            for teaser in show_page['teasers']:  # process playlist items
                if '/compilation/' in teaser['links']['target']['href']:
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@ -52,7 +52,7 @@ class FacebookIE(InfoExtractor):
                            )\?(?:.*?)(?:v|video_id|story_fbid)=|
                            [^/]+/videos/(?:[^/]+/)?|
                            [^/]+/posts/|
-                            groups/[^/]+/(?:permalink|posts)/|
+                            groups/[^/]+/permalink/|
                            watchparty/
                        )|
                    facebook:
@ -232,21 +232,6 @@ class FacebookIE(InfoExtractor):
            'uploader_id': '100013949973717',
        },
        'skip': 'Requires logging in',
-    }, {
-        # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
-        'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
-        'info_dict': {
-            'id': '1569199726448814',
-            'ext': 'mp4',
-            'title': 'Pence MUST GO!',
-            'description': 'Vickie Gentry shared a memory.',
-            'timestamp': 1511548260,
-            'upload_date': '20171124',
-            'uploader': 'Vickie Gentry',
-            'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
-            'thumbnail': r're:^https?://.*',
-            'duration': 148.435,
-        },
    }, {
        'url': 'https://www.facebook.com/video.php?v=10204634152394104',
        'only_matching': True,
@ -627,11 +612,9 @@ class FacebookIE(InfoExtractor):
                nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
                attachments = traverse_obj(nodes, (
                    ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
-                    ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
-                    'attachment', {dict}))
+                    ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
                for attachment in attachments:
-                    ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
-                                      ('target', 'attachments', ..., 'styles', 'attachment', {dict}))
+                    ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
                    for n in ns:
                        parse_attachment(n)
                    parse_attachment(attachment)
@ -654,7 +637,7 @@ class FacebookIE(InfoExtractor):
                if len(entries) > 1:
                    return self.playlist_result(entries, video_id)

-                video_info = entries[0] if entries else {'id': video_id}
+                video_info = entries[0]
                webpage_info = extract_metadata(webpage)
                # honor precise duration in video info
                if video_info.get('duration'):
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@ -10,7 +10,6 @@ from ..utils import (
    ExtractorError,
    decode_base_n,
    encode_base_n,
-    filter_dict,
    float_or_none,
    format_field,
    get_element_by_attribute,
@ -704,31 +703,28 @@ class InstagramStoryIE(InstagramBaseIE):
        user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
        if not user_info:
            self.raise_login_required('This content is unreachable')
+        user_id = user_info.get('id')

-        user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str)
        story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
-        if not story_info_url:  # user id is only mandatory for non-highlights
-            raise ExtractorError('Unable to extract user id')
-
        videos = traverse_obj(self._download_json(
            f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
            story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
        if not videos:
            self.raise_login_required('You need to log in to access this content')

-        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name'))
+        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
        story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
        if not story_title:
            story_title = f'Story by {username}'

-        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items'))
+        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
        info_data = []
        for highlight in highlights:
            highlight_data = self._extract_product(highlight)
            if highlight_data.get('formats'):
                info_data.append({
+                    **highlight_data,
                    'uploader': full_name,
                    'uploader_id': user_id,
-                    **filter_dict(highlight_data),
                })
        return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)
--- a/yt_dlp/extractor/litv.py
+++ b/yt_dlp/extractor/litv.py
@ -6,7 +6,6 @@ from ..utils import (
    int_or_none,
    smuggle_url,
    traverse_obj,
-    try_call,
    unsmuggle_url,
 )

@ -97,22 +96,13 @@ class LiTVIE(InfoExtractor):
            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
            webpage, 'video data', default='{}'), video_id)
        if not video_data:
-            payload = {'assetId': program_info['assetId']}
-            puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
-            if puid:
-                payload.update({
-                    'type': 'auth',
-                    'puid': puid,
-                })
-                endpoint = 'getUrl'
-            else:
-                payload.update({
-                    'watchDevices': program_info['watchDevices'],
-                    'contentType': program_info['contentType'],
-                })
-                endpoint = 'getMainUrlNoAuth'
+            payload = {
+                'assetId': program_info['assetId'],
+                'watchDevices': program_info['watchDevices'],
+                'contentType': program_info['contentType'],
+            }
            video_data = self._download_json(
-                f'https://www.litv.tv/vod/ajax/{endpoint}', video_id,
+                'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id,
                data=json.dumps(payload).encode('utf-8'),
                headers={'Content-Type': 'application/json'})

--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@ -10,7 +10,6 @@ from ..compat import (
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlparse,
 )
-from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
    dict_get,
@ -1318,51 +1317,41 @@ class TwitterIE(TwitterBaseIE):
            }
        }

-    def _call_syndication_api(self, twid):
-        self.report_warning(
-            'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
-        status = self._download_json(
-            'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
-            headers={'User-Agent': 'Googlebot'}, query={
-                'id': twid,
-                # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
-                'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
-            })
-        if not status:
-            raise ExtractorError('Syndication endpoint returned empty JSON response')
-        # Transform the result so its structure matches that of legacy/graphql
-        media = []
-        for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
-            detail['id_str'] = traverse_obj(detail, (
-                'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
-            media.append(detail)
-        status['extended_entities'] = {'media': media}
-
-        return status
-
    def _extract_status(self, twid):
-        if self._selected_api not in ('graphql', 'legacy', 'syndication'):
-            raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True)
+        if self.is_logged_in or self._selected_api == 'graphql':
+            status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)

-        try:
-            if self.is_logged_in or self._selected_api == 'graphql':
-                status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
-            elif self._selected_api == 'legacy':
-                status = self._call_api(f'statuses/show/{twid}.json', twid, {
-                    'cards_platform': 'Web-12',
-                    'include_cards': 1,
-                    'include_reply_count': 1,
-                    'include_user_entities': 0,
-                    'tweet_mode': 'extended',
+        elif self._selected_api == 'legacy':
+            status = self._call_api(f'statuses/show/{twid}.json', twid, {
+                'cards_platform': 'Web-12',
+                'include_cards': 1,
+                'include_reply_count': 1,
+                'include_user_entities': 0,
+                'tweet_mode': 'extended',
+            })
+
+        elif self._selected_api == 'syndication':
+            self.report_warning(
+                'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
+            status = self._download_json(
+                'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
+                headers={'User-Agent': 'Googlebot'}, query={
+                    'id': twid,
+                    # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
+                    'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
                })
-        except ExtractorError as e:
-            if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
-                raise
-            self.report_warning('Rate-limit exceeded; falling back to syndication endpoint')
-            status = self._call_syndication_api(twid)
+            if not status:
+                raise ExtractorError('Syndication endpoint returned empty JSON response')
+            # Transform the result so its structure matches that of legacy/graphql
+            media = []
+            for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
+                detail['id_str'] = traverse_obj(detail, (
+                    'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
+                media.append(detail)
+            status['extended_entities'] = {'media': media}

-        if self._selected_api == 'syndication':
-            status = self._call_syndication_api(twid)
+        else:
+            raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)

        return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}

@ -1427,8 +1416,8 @@ class TwitterIE(TwitterBaseIE):
                'thumbnails': thumbnails,
                'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})),  # No longer available
                'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
-                # Prioritize m3u8 formats for compat, see https://github.com/yt-dlp/yt-dlp/issues/8117
-                '_format_sort_fields': ('res', 'proto:m3u8', 'br', 'size'),  # http format codec is unknown
+                # The codec of http formats is unknown
+                '_format_sort_fields': ('res', 'br', 'size', 'proto'),
            }

        def extract_from_card_info(card):