Make sub parsing non fatal

Remove lyd tests
Fix typo
2024-09-22 02:11:26 +02:00 · 2023-11-07 05:20:45 +01:00 · 2023-11-07 04:59:51 +01:00 · 2023-11-07 04:58:33 +01:00 · 2023-11-07 04:56:40 +01:00
1 changed file with 42 additions and 79 deletions
--- a/yt_dlp/extractor/drtv.py
+++ b/yt_dlp/extractor/drtv.py
@@ -19,7 +19,7 @@ class DRTVIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
-                            (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?P<radio>radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*|
+                            (?:www\.)?dr\.dk/tv/se(?:/ondemand)?/(?:[^/]+/)*|
                            (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
                        )
                        (?P<id>[\da-z_-]+)
@@ -48,22 +48,6 @@ class DRTVIE(InfoExtractor):
        },
        'expected_warnings': ['Unable to download f4m manifest'],
        'skip': 'this video has been removed',
-    }, {
-        # embed
-        'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
-        'info_dict': {
-            'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6',
-            'ext': 'mp4',
-            'title': 'christiania pusher street ryddes drdkrjpo',
-            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
-            'timestamp': 1472800279,
-            'upload_date': '20160902',
-            'duration': 131.4,
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'expected_warnings': ['Unable to download f4m manifest'],
    }, {
        # with SignLanguage formats
        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
@@ -82,29 +66,25 @@ class DRTVIE(InfoExtractor):
            'season': 'Historien om Danmark',
            'series': 'Historien om Danmark',
        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9',
-        'only_matching': True,
+        'skip': 'this video has been removed',
    }, {
        'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769',
        'info_dict': {
            'id': '00951930010',
            'ext': 'mp4',
-            'title': 'Bonderøven 2019 (1:8)',
-            'description': 'md5:b6dcfe9b6f0bea6703e9a0092739a5bd',
-            'timestamp': 1654856100,
-            'upload_date': '20220610',
-            'duration': 2576.6,
-            'season': 'Bonderøven 2019',
-            'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5',
+            'title': 'Frank & Kastaniegaarden',
+            'description': 'md5:974e1780934cf3275ef10280204bccb0',
+            'release_timestamp': 1546545600,
+            'release_date': '20190103',
+            'duration': 2576,
+            'season': 'Frank & Kastaniegaarden',
+            'season_id': '67125',
            'release_year': 2019,
            'season_number': 2019,
            'series': 'Frank & Kastaniegaarden',
            'episode_number': 1,
-            'episode': 'Episode 1',
+            'episode': 'Frank & Kastaniegaarden',
+            'thumbnail': r're:https?://.+',
        },
        'params': {
            'skip_download': True,
@@ -118,40 +98,6 @@ class DRTVIE(InfoExtractor):
    }, {
        'url': 'https://www.dr.dk/drtv/program/jagten_220924',
        'only_matching': True,
-    }, {
-        'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3',
-        'info_dict': {
-            'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113',
-            'title': "Regionale nyheder",
-            'ext': 'mp4',
-            'duration': 120.043,
-            'series': 'P4 Østjylland regionale nyheder',
-            'timestamp': 1651746600,
-            'season': 'Regionale nyheder',
-            'release_year': 0,
-            'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5',
-            'description': '',
-            'upload_date': '20220505',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'this video has been removed',
-    }, {
-        'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/regionale-nyheder-2023-03-14-10-30-9',
-        'info_dict': {
-            'ext': 'mp4',
-            'id': '14802310112',
-            'timestamp': 1678786200,
-            'duration': 120.043,
-            'season_id': 'urn:dr:mu:bundle:63a4f7c87140143504b6710f',
-            'series': 'P4 København regionale nyheder',
-            'upload_date': '20230314',
-            'release_year': 0,
-            'description': 'Hør seneste regionale nyheder fra P4 København.',
-            'season': 'Regionale nyheder',
-            'title': 'Regionale nyheder',
-        },
    }]

    _TOKEN = None
@@ -176,22 +122,38 @@ class DRTVIE(InfoExtractor):
                    'Catalog',
                ],
                'optout': True,
-            }).encode('utf-8'))
+            }).encode())

        self._TOKEN = traverse_obj(
            token_response, (lambda _, x: x['type'] == 'UserAccount', 'value'), get_all=False)

    def _real_extract(self, url):
-        raw_video_id = self._match_valid_url(url).group('id')
-        webpage = self._download_webpage(url, raw_video_id)
+        url_slug = self._match_id(url)
+        webpage = self._download_webpage(url, url_slug)

-        json_data = self._search_json(r'window\.__data\s*=\s*', webpage, 'data', raw_video_id)
-        item = traverse_obj(json_data, ('cache', 'page', ..., (None, ('entries', 0)), 'item'), get_all=False) or {}
-        item_id = item.get('id') or raw_video_id.rsplit('_', 1)[-1]
-        video_id = try_call(item['customId'].split(':')[-1]) or item_id
+        json_data = self._search_json(r'window\.__data\s*=\s*', webpage, 'data', url_slug, fatal=False) or {}
+        item = traverse_obj(json_data, ('cache', 'page', ..., (None, ('entries', 0)), 'item'), get_all=False)
+        if item:
+            item_id = item.get('id')
+        else:
+            item_id = url_slug.rsplit('_', 1)[-1]
+            item = self._download_json(
+                f'https://production-cdn.dr-massive.com/api/items/{item_id}', item_id,
+                note='Attempting to download backup item data', query={
+                    'device': 'web_browser',
+                    'expand': 'all',
+                    'ff': 'idp,ldp,rpt',
+                    'geoLocation': 'dk',
+                    'isDeviceAbroad': 'false',
+                    'lang': 'da',
+                    'segments': 'drtv,optedout',
+                    'sub': 'Anonymous',
+                })

+        video_id = try_call(lambda: item['customId'].split(':')[-1]) or item_id
        stream_data = self._download_json(
-            f'https://production.dr-massive.com/api/account/items/{item_id}/videos', video_id, query={
+            f'https://production.dr-massive.com/api/account/items/{item_id}/videos', video_id,
+            note='Downloading stream data', query={
                'delivery': 'stream',
                'device': 'web_browser',
                'ff': 'idp,ldp,rpt',
@@ -211,17 +173,18 @@ class DRTVIE(InfoExtractor):
            preference = None
            if access_service in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
                preference = -1
-                format_id += '-%s' % access_service
+                format_id += f'-{access_service}'
            elif access_service == 'StandardVideo':
                preference = 1
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(fmt['url'], video_id, preference=preference, m3u8_id=format_id, fatal=False)
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                fmt.get('url'), video_id, preference=preference, m3u8_id=format_id, fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
            LANGS = {
                'DanishLanguageSubtitles': 'da',
            }

-            for subs in fmt['subtitles']:
+            for subs in fmt.get('subtitles', []):
                if not isinstance(subs, dict):
                    continue
                sub_uri = url_or_none(subs.get('link'))
@@ -241,10 +204,10 @@ class DRTVIE(InfoExtractor):
                'title': 'title',
                'description': 'description',
                'thumbnail': ('images', 'wallpaper'),
-                'timestamp': ('customFields', 'BroadcastTimeDK', {parse_iso8601}),
+                'release_timestamp': ('customFields', 'BroadcastTimeDK', {parse_iso8601}),
                'duration': 'duration',
-                'series': ('season', 'title'),
-                'season': ('season', 'show', 'title'),
+                'series': ('season', 'show', 'title'),
+                'season': ('season', 'title'),
                'season_number': ('season', 'seasonNumber', {int_or_none}),
                'season_id': 'seasonId',
                'episode': 'episodeName',
Author	SHA1	Message	Date
sepro	49a10a5ea8	Make sub parsing non fatal	2023-11-07 05:20:45 +01:00
sepro	51d214d85c	Remove lyd tests	2023-11-07 04:59:51 +01:00
sepro	09a90428f0	Fix typo	2023-11-07 04:58:33 +01:00
sepro	42c8eadf16	Cleanup Part 2	2023-11-07 04:56:40 +01:00