Merge branch 'yt-dlp:master' into master

[ie/Facebook] Fix Memories extraction (#8681)
- Support group /posts/ URLs
- Raise a proper error message if no formats are found

Closes #8669
Authored by: kclauhk
2024-11-27 09:41:26 +01:00 · 2023-12-25 07:46:11 +05:30 · 2023-12-24 23:43:35 +01:00 · 2023-12-24 23:38:21 +01:00 · 2023-12-24 22:09:01 +01:00 · 2023-12-24 16:41:28 +00:00
7 changed files with 127 additions and 72 deletions
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -140,6 +140,8 @@ class TestFormatSelection(unittest.TestCase):
        test('example-with-dashes', 'example-with-dashes')
        test('all', '2', '47', '45', 'example-with-dashes', '35')
        test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
        # See: https://github.com/yt-dlp/yt-dlp/pulls/8797
        test('7_a/worst', '35')
    def test_format_selection_audio(self):
        formats = [
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -2465,9 +2465,16 @@ class YoutubeDL:
                return selector_function(ctx_copy)
            return final_selector
-        stream = io.BytesIO(format_spec.encode())
+        # HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
        #       Prefix numbers with random letters to avoid it being classified as a number
        #       See: https://github.com/yt-dlp/yt-dlp/pulls/8797
        # TODO: Implement parser not reliant on tokenize.tokenize
        prefix = ''.join(random.choices(string.ascii_letters, k=32))
        stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
        try:
-            tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
+            tokens = list(_remove_unused_ops(
                token._replace(string=token.string.replace(prefix, ''))
                for token in tokenize.tokenize(stream.readline)))
        except tokenize.TokenError:
            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@ -292,7 +292,7 @@ class ARDIE(InfoExtractor):
    _TESTS = [{
        # available till 7.12.2023
        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
-        'md5': 'a438f671e87a7eba04000336a119ccc4',
+        'md5': '94812e6438488fb923c361a44469614b',
        'info_dict': {
            'id': 'maischberger-video-424',
            'display_id': 'maischberger-video-424',
@ -403,26 +403,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:(?P<client>[^/]+)/)?
-        (?:player|live|video|(?P<playlist>sendung|sammlung))/
+        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
    _TESTS = [{
-        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
+        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
-        'md5': '3fd5fead7a370a819341129c8d713136',
+        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
        'info_dict': {
-            'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
+            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
-            'id': '12172961',
+            'id': '12939099',
-            'title': 'Wolfsland - Die traurigen Schwestern',
+            'title': 'Liebe auf vier Pfoten',
-            'description': r're:^Als der Polizeiobermeister Raaben',
+            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
-            'duration': 5241,
+            'duration': 5222,
-            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
-            'timestamp': 1670710500,
+            'timestamp': 1701343800,
-            'upload_date': '20221210',
+            'upload_date': '20231130',
            'ext': 'mp4',
-            'age_limit': 12,
+            'episode': 'Liebe auf vier Pfoten',
            'episode': 'Wolfsland - Die traurigen Schwestern',
            'series': 'Filme im MDR'
        },
    }, {
@ -454,7 +453,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
            'duration': 915,
            'episode': 'tagesschau, 20:00 Uhr',
            'series': 'tagesschau',
-            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -475,6 +474,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
        # playlist of type 'sendung'
        'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
        'only_matching': True,
    }, {
        # playlist of type 'serie'
        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
        'only_matching': True,
    }, {
        # playlist of type 'sammlung'
        'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@ -487,10 +490,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
        'only_matching': True,
    }]
-    def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
+    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
        """ Query the ARD server for playlist information
        and returns the data in "raw" format """
-        if mode == 'sendung':
+        assert mode in ('sendung', 'serie', 'sammlung')
        if mode in ('sendung', 'serie'):
            graphQL = json.dumps({
                'query': '''{
                    showPage(
@ -507,7 +511,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
                            links { target { id href title } }
                            type
                        }
-                    }}''' % (client, playlist_id, pageNumber),
+                    }}''' % (client, playlist_id, page_number),
            }).encode()
        else:  # mode == 'sammlung'
            graphQL = json.dumps({
@ -528,7 +532,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
                                type
                            }
                        }
-                    }}''' % (client, playlist_id, pageNumber),
+                    }}''' % (client, playlist_id, page_number),
            }).encode()
        # Resources for ARD graphQL debugging:
        # https://api-test.ardmediathek.de/public-gateway
@ -538,7 +542,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
            data=graphQL,
            headers={'Content-Type': 'application/json'})['data']
        # align the structure of the returned data:
-        if mode == 'sendung':
+        if mode in ('sendung', 'serie'):
            show_page = show_page['showPage']
        else:  # mode == 'sammlung'
            show_page = show_page['morePage']['widget']
@ -546,12 +550,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
        """ Collects all playlist entries and returns them as info dict.
-        Supports playlists of mode 'sendung' and 'sammlung', and also nested
+        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
-        playlists. """
+        as well as nested playlists. """
        entries = []
        pageNumber = 0
        while True:  # iterate by pageNumber
-            show_page = self._ARD_load_playlist_snipped(
+            show_page = self._ARD_load_playlist_snippet(
                playlist_id, display_id, client, mode, pageNumber)
            for teaser in show_page['teasers']:  # process playlist items
                if '/compilation/' in teaser['links']['target']['href']:
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@ -52,7 +52,7 @@ class FacebookIE(InfoExtractor):
                            )\?(?:.*?)(?:v|video_id|story_fbid)=|
                            [^/]+/videos/(?:[^/]+/)?|
                            [^/]+/posts/|
-                            groups/[^/]+/permalink/|
+                            groups/[^/]+/(?:permalink|posts)/|
                            watchparty/
                        )|
                    facebook:
@ -232,6 +232,21 @@ class FacebookIE(InfoExtractor):
            'uploader_id': '100013949973717',
        },
        'skip': 'Requires logging in',
    }, {
        # data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
        'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
        'info_dict': {
            'id': '1569199726448814',
            'ext': 'mp4',
            'title': 'Pence MUST GO!',
            'description': 'Vickie Gentry shared a memory.',
            'timestamp': 1511548260,
            'upload_date': '20171124',
            'uploader': 'Vickie Gentry',
            'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
            'thumbnail': r're:^https?://.*',
            'duration': 148.435,
        },
    }, {
        'url': 'https://www.facebook.com/video.php?v=10204634152394104',
        'only_matching': True,
@ -612,9 +627,11 @@ class FacebookIE(InfoExtractor):
                nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
                attachments = traverse_obj(nodes, (
                    ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
-                    ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
+                    ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
                    'attachment', {dict}))
                for attachment in attachments:
-                    ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+                    ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
                                      ('target', 'attachments', ..., 'styles', 'attachment', {dict}))
                    for n in ns:
                        parse_attachment(n)
                    parse_attachment(attachment)
@ -637,7 +654,7 @@ class FacebookIE(InfoExtractor):
                if len(entries) > 1:
                    return self.playlist_result(entries, video_id)
-                video_info = entries[0]
+                video_info = entries[0] if entries else {'id': video_id}
                webpage_info = extract_metadata(webpage)
                # honor precise duration in video info
                if video_info.get('duration'):
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@ -10,6 +10,7 @@ from ..utils import (
    ExtractorError,
    decode_base_n,
    encode_base_n,
    filter_dict,
    float_or_none,
    format_field,
    get_element_by_attribute,
@ -703,28 +704,31 @@ class InstagramStoryIE(InstagramBaseIE):
        user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
        if not user_info:
            self.raise_login_required('This content is unreachable')
        user_id = user_info.get('id')
        user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str)
        story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
        if not story_info_url:  # user id is only mandatory for non-highlights
            raise ExtractorError('Unable to extract user id')
        videos = traverse_obj(self._download_json(
            f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
            story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
        if not videos:
            self.raise_login_required('You need to log in to access this content')
-        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
+        full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name'))
        story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
        if not story_title:
            story_title = f'Story by {username}'
-        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
+        highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items'))
        info_data = []
        for highlight in highlights:
            highlight_data = self._extract_product(highlight)
            if highlight_data.get('formats'):
                info_data.append({
                    **highlight_data,
                    'uploader': full_name,
                    'uploader_id': user_id,
                    **filter_dict(highlight_data),
                })
        return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)
--- a/yt_dlp/extractor/litv.py
+++ b/yt_dlp/extractor/litv.py
@ -6,6 +6,7 @@ from ..utils import (
    int_or_none,
    smuggle_url,
    traverse_obj,
    try_call,
    unsmuggle_url,
 )
@ -96,13 +97,22 @@ class LiTVIE(InfoExtractor):
            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
            webpage, 'video data', default='{}'), video_id)
        if not video_data:
-            payload = {
+            payload = {'assetId': program_info['assetId']}
-                'assetId': program_info['assetId'],
+            puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
-                'watchDevices': program_info['watchDevices'],
+            if puid:
-                'contentType': program_info['contentType'],
+                payload.update({
-            }
+                    'type': 'auth',
                    'puid': puid,
                })
                endpoint = 'getUrl'
            else:
                payload.update({
                    'watchDevices': program_info['watchDevices'],
                    'contentType': program_info['contentType'],
                })
                endpoint = 'getMainUrlNoAuth'
            video_data = self._download_json(
-                'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id,
+                f'https://www.litv.tv/vod/ajax/{endpoint}', video_id,
                data=json.dumps(payload).encode('utf-8'),
                headers={'Content-Type': 'application/json'})
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@ -10,6 +10,7 @@ from ..compat import (
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlparse,
 )
 from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
    dict_get,
@ -1317,41 +1318,51 @@ class TwitterIE(TwitterBaseIE):
            }
        }
-    def _extract_status(self, twid):
+    def _call_syndication_api(self, twid):
-        if self.is_logged_in or self._selected_api == 'graphql':
+        self.report_warning(
-            status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
+            'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
-
+        status = self._download_json(
-        elif self._selected_api == 'legacy':
+            'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
-            status = self._call_api(f'statuses/show/{twid}.json', twid, {
+            headers={'User-Agent': 'Googlebot'}, query={
-                'cards_platform': 'Web-12',
+                'id': twid,
-                'include_cards': 1,
+                # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
-                'include_reply_count': 1,
+                'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
                'include_user_entities': 0,
                'tweet_mode': 'extended',
            })
        if not status:
            raise ExtractorError('Syndication endpoint returned empty JSON response')
        # Transform the result so its structure matches that of legacy/graphql
        media = []
        for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
            detail['id_str'] = traverse_obj(detail, (
                'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
            media.append(detail)
        status['extended_entities'] = {'media': media}
-        elif self._selected_api == 'syndication':
+        return status
-            self.report_warning(
+
-                'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
+    def _extract_status(self, twid):
-            status = self._download_json(
+        if self._selected_api not in ('graphql', 'legacy', 'syndication'):
-                'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
+            raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True)
-                headers={'User-Agent': 'Googlebot'}, query={
+
-                    'id': twid,
+        try:
-                    # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
+            if self.is_logged_in or self._selected_api == 'graphql':
-                    'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
+                status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
            elif self._selected_api == 'legacy':
                status = self._call_api(f'statuses/show/{twid}.json', twid, {
                    'cards_platform': 'Web-12',
                    'include_cards': 1,
                    'include_reply_count': 1,
                    'include_user_entities': 0,
                    'tweet_mode': 'extended',
                })
-            if not status:
+        except ExtractorError as e:
-                raise ExtractorError('Syndication endpoint returned empty JSON response')
+            if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
-            # Transform the result so its structure matches that of legacy/graphql
+                raise
-            media = []
+            self.report_warning('Rate-limit exceeded; falling back to syndication endpoint')
-            for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
+            status = self._call_syndication_api(twid)
                detail['id_str'] = traverse_obj(detail, (
                    'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
                media.append(detail)
            status['extended_entities'] = {'media': media}
-        else:
+        if self._selected_api == 'syndication':
-            raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
+            status = self._call_syndication_api(twid)
        return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}
@ -1416,8 +1427,8 @@ class TwitterIE(TwitterBaseIE):
                'thumbnails': thumbnails,
                'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})),  # No longer available
                'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
-                # The codec of http formats are unknown
+                # Prioritize m3u8 formats for compat, see https://github.com/yt-dlp/yt-dlp/issues/8117
-                '_format_sort_fields': ('res', 'br', 'size', 'proto'),
+                '_format_sort_fields': ('res', 'proto:m3u8', 'br', 'size'),  # http format codec is unknown
            }
        def extract_from_card_info(card):
Author	SHA1	Message	Date
alien-developers	91ca560381	Merge branch 'yt-dlp:master' into master	2023-12-25 07:46:11 +05:30
kclauhk	c39358a54b	[ie/Facebook] Fix Memories extraction (#8681) - Support group /posts/ URLs - Raise a proper error message if no formats are found Closes #8669 Authored by: kclauhk	2023-12-24 23:43:35 +01:00
Lars Strojny	1f8bd8eba8	[ie/ARDBetaMediathek] Fix series extraction (#8687) Closes #7666 Authored by: lstrojny	2023-12-24 23:38:21 +01:00
Simon Sawicki	00cdda4f6f	[core] Fix format selection parse error for CPython 3.12 (#8797) Authored by: Grub4K	2023-12-24 22:09:01 +01:00
bashonly	116c268438	[ie/twitter] Work around API rate-limit (#8825) Closes #8762 Authored by: bashonly	2023-12-24 16:41:28 +00:00
bashonly	e7d22348e7	[ie/twitter] Prioritize m3u8 formats (#8826) Closes #8117 Authored by: bashonly	2023-12-24 16:40:50 +00:00
bashonly	50eaea9fd7	[ie/instagram] Fix stories extraction (#8843) Closes #8290 Authored by: bashonly	2023-12-24 16:40:03 +00:00
bashonly	f45c4efcd9	[ie/litv] Fix premium content extraction (#8842) Closes #8654 Authored by: bashonly	2023-12-24 16:33:16 +00:00