Compare commits


No commits in common. "585d0ed9abcfcb957f2b2684b8ad43c3af160383" and "66a0127d45033c698bdbedf162cddc55d9e7b906" have entirely different histories.

8 changed files with 64 additions and 359 deletions

yt_dlp/extractor/_extractors.py

@@ -2029,10 +2029,6 @@ from .tenplay import (
from .testurl import TestURLIE
from .tf1 import TF1IE
from .tfo import TFOIE
from .theguardian import (
TheGuardianPodcastIE,
TheGuardianPodcastPlaylistIE,
)
from .theholetv import TheHoleTvIE
from .theintercept import TheInterceptIE
from .theplatform import (
@@ -2305,7 +2301,6 @@ from .vidio import (
VidioLiveIE
)
from .vidlii import VidLiiIE
from .vidly import VidlyIE
from .viewlift import (
ViewLiftIE,
ViewLiftEmbedIE,

yt_dlp/extractor/common.py

@@ -2341,9 +2341,7 @@ class InfoExtractor:
imgs_count = 0
srcs = set()
media = itertools.chain.from_iterable(
smil.findall(self._xpath_ns(arg, namespace))
for arg in ['.//video', './/audio', './/media'])
media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
for medium in media:
src = medium.get('src')
if not src or src in srcs:

yt_dlp/extractor/nintendo.py

@@ -1,131 +1,57 @@
import json
import urllib.parse
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
make_archive_id,
unified_timestamp,
urljoin,
)
from ..utils.traversal import traverse_obj
from .ooyala import OoyalaIE
class NintendoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:(?P<locale>\w{2}(?:-\w{2})?)/)?nintendo-direct/(?P<slug>[^/?#]+)'
_VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/',
'info_dict': {
'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
'ext': 'flv',
'title': 'Duck Hunt Wii U VC NES - Trailer',
'duration': 60.326,
},
'params': {
'skip_download': True,
},
'add_ie': ['Ooyala'],
}, {
'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u',
'info_dict': {
'id': 'tokyo-mirage-sessions-fe-wii-u',
'title': 'Tokyo Mirage Sessions ♯FE',
},
'playlist_count': 4,
}, {
'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/',
'info_dict': {
'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V',
'ext': 'mp4',
'id': '2oPmiviVePUA1IqAZzjuVh',
'display_id': '09-04-2019',
'title': 'Nintendo Direct 9.4.2019',
'timestamp': 1567580400,
'description': 'md5:8aac2780361d8cb772b6d1de66d7d6f4',
'upload_date': '20190904',
'age_limit': 17,
'_old_archive_ids': ['nintendo J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V'],
'title': 'Switch_ROS_ND0904-H264.mov',
'duration': 2324.758,
},
}, {
'url': 'https://www.nintendo.com/en-ca/nintendo-direct/08-31-2023/',
'info_dict': {
'ext': 'mp4',
'id': '2TB2w2rJhNYF84qQ9E57hU',
'display_id': '08-31-2023',
'title': 'Super Mario Bros. Wonder Direct 8.31.2023',
'timestamp': 1693465200,
'description': 'md5:3067c5b824bcfdae9090a7f38ab2d200',
'tags': ['Mild Fantasy Violence', 'In-Game Purchases'],
'upload_date': '20230831',
'age_limit': 6,
},
}, {
'url': 'https://www.nintendo.com/us/nintendo-direct/50-fact-extravaganza/',
'info_dict': {
'ext': 'mp4',
'id': 'j0BBGzfw0pQ',
'channel_follower_count': int,
'view_count': int,
'description': 'Learn new details about Super Smash Bros. for Wii U, which launches on November 21.',
'duration': 2123,
'availability': 'public',
'thumbnail': 'https://i.ytimg.com/vi_webp/j0BBGzfw0pQ/maxresdefault.webp',
'timestamp': 1414047600,
'channel_id': 'UCGIY_O-8vW4rfX98KlMkvRg',
'chapters': 'count:53',
'heatmap': 'count:100',
'upload_date': '20141023',
'uploader_id': '@NintendoAmerica',
'playable_in_embed': True,
'categories': ['Gaming'],
'display_id': '50-fact-extravaganza',
'channel': 'Nintendo of America',
'tags': ['Comic Mischief', 'Cartoon Violence', 'Mild Suggestive Themes'],
'like_count': int,
'channel_url': 'https://www.youtube.com/channel/UCGIY_O-8vW4rfX98KlMkvRg',
'age_limit': 10,
'uploader_url': 'https://www.youtube.com/@NintendoAmerica',
'comment_count': int,
'live_status': 'not_live',
'uploader': 'Nintendo of America',
'title': '50-FACT Extravaganza',
'params': {
'skip_download': True,
},
'add_ie': ['Ooyala'],
}]
def _create_asset_url(self, path):
return urljoin('https://assets.nintendo.com/', urllib.parse.quote(path))
def _real_extract(self, url):
locale, slug = self._match_valid_url(url).group('locale', 'slug')
page_id = self._match_id(url)
language, _, country = (locale or 'US').rpartition('-')
parsed_locale = f'{language.lower() or "en"}_{country.upper()}'
self.write_debug(f'Using locale {parsed_locale} (from {locale})', only_once=True)
webpage = self._download_webpage(url, page_id)
response = self._download_json('https://graph.nintendo.com/', slug, query={
'operationName': 'NintendoDirect',
'variables': json.dumps({
'locale': parsed_locale,
'slug': slug,
}, separators=(',', ':')),
'extensions': json.dumps({
'persistedQuery': {
'version': 1,
'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb'
},
}, separators=(',', ':')),
})
# API returns `{"data": {"direct": null}}` if no matching id
direct_info = traverse_obj(response, ('data', 'direct', {dict}))
if not direct_info:
raise ExtractorError(f'No Nintendo Direct with id {slug} exists', expected=True)
entries = [
OoyalaIE._build_url_result(m.group('code'))
for m in re.finditer(
r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)]
errors = ', '.join(traverse_obj(response, ('errors', ..., 'message')))
if errors:
raise ExtractorError(f'GraphQL API error: {errors or "Unknown error"}')
title = self._html_search_regex(
r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>',
webpage, 'title', fatal=False)
result = traverse_obj(direct_info, {
'id': ('id', {str}),
'title': ('name', {str}),
'timestamp': ('startDate', {unified_timestamp}),
'description': ('description', 'text', {str}),
'age_limit': ('contentRating', 'order', {int}),
'tags': ('contentDescriptors', ..., 'label', {str}),
'thumbnail': ('thumbnail', {self._create_asset_url}),
})
result['display_id'] = slug
asset_id = traverse_obj(direct_info, ('video', 'publicId', {str}))
if not asset_id:
youtube_id = traverse_obj(direct_info, ('liveStream', {str}))
if not youtube_id:
self.raise_no_formats('Could not find any video formats', video_id=slug)
return self.url_result(youtube_id, **result, url_transparent=True)
if asset_id.startswith('Legacy Videos/'):
result['_old_archive_ids'] = [make_archive_id(self, asset_id[14:])]
result['formats'] = self._extract_m3u8_formats(
self._create_asset_url(f'/video/upload/sp_full_hd/v1/{asset_id}.m3u8'), slug)
return result
return self.playlist_result(
entries, page_id, title)

yt_dlp/extractor/periscope.py

@@ -35,7 +35,6 @@ class PeriscopeBaseIE(InfoExtractor):
'uploader_id': broadcast.get('user_id') or broadcast.get('username'),
'thumbnails': thumbnails,
'view_count': int_or_none(broadcast.get('total_watched')),
'concurrent_view_count': int_or_none(broadcast.get('total_watching')),
'tags': broadcast.get('tags'),
'live_status': {
'running': 'is_live',

yt_dlp/extractor/theguardian.py

@@ -1,135 +0,0 @@
import itertools
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
get_elements_html_by_class,
parse_qs,
traverse_obj,
unified_strdate,
urljoin
)
class TheGuardianPodcastIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
'md5': 'd1771744681789b4cd7da2a08e487702',
'info_dict': {
'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
'ext': 'mp3',
'title': 'We are just getting started: the plastic-eating bacteria that could change the world podcast',
'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
'creator': 'Stephen Buranyi',
'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
'release_date': '20231103'
}
}, {
'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
'md5': 'd1771744681789b4cd7da2a08e487702',
'info_dict': {
'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
'ext': 'mp3',
'title': 'The trials of Robert Habeck: is the worlds most powerful green politician doomed to fail? podcast',
'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
'creator': 'Philip Oltermann',
'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
'release_date': '20231030'
}
}, {
'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
'md5': 'a2fcff6f8e060a95b1483295273dc35e',
'info_dict': {
'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
'ext': 'mp3',
'title': 'Arsenal feel hard done by and Luton hold Liverpool Football Weekly',
'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
'creator': 'Max Rushden',
'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
'release_date': '20231106'
}
}, {
'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
'md5': '06a0f7e9701a80c8064a5d35690481ec',
'info_dict': {
'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
'ext': 'mp3',
'title': 'The Covid inquiry | Politics Weekly UK - podcast',
'description': 'md5:207c98859c14903582b17d25b014046e',
'creator': 'Gaby Hinsliff',
'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
'release_date': '20231102'
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
return {
'id': video_id,
'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
'description': self._og_search_description(webpage),
'creator': self._html_search_meta('author', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
'url': extract_attributes(get_element_html_by_class(
'podcast__player', webpage) or '').get('data-source'),
}
class TheGuardianPodcastPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
_TESTS = [{
'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
'info_dict': {
'id': 'theguardianswomensfootballweekly',
'title': "The Guardian's Women's Football Weekly",
'description': 'md5:e2cc021311e582d29935a73614a43f51'
},
'playlist_mincount': 69
}, {
'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
'info_dict': {
'id': 'todayinfocus',
'title': 'Today in Focus',
'description': 'md5:0f097764fc0d359e0b6eb537be0387e2'
},
'playlist_mincount': 1261
}, {
'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
'info_dict': {
'id': 'the-audio-long-read',
'title': 'The Audio Long Read',
'description': 'md5:5462994a27527309562b25b6defc4ef3'
},
'playlist_mincount': 996
}]
def _entries(self, url, playlist_id):
for page in itertools.count(1):
webpage, urlh = self._download_webpage_handle(
url, playlist_id, f'Downloading page {page}', query={'page': page})
if 'page' not in parse_qs(urlh.url):
break
episodes = get_elements_html_by_class('fc-item--type-media', webpage)
for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'data-id')):
yield url_path
def _real_extract(self, url):
podcast_id = self._match_id(url)
webpage = self._download_webpage(url, podcast_id)
title = clean_html(get_element_by_class(
'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
description = self._og_search_description(webpage) or self._html_search_meta(
'description', webpage)
return self.playlist_from_matches(
self._entries(url, podcast_id), podcast_id, title, description=description,
ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))

yt_dlp/extractor/twitcasting.py

@@ -11,6 +11,7 @@ from ..utils import (
float_or_none,
get_element_by_class,
get_element_by_id,
int_or_none,
parse_duration,
qualities,
str_to_int,
@@ -241,31 +242,35 @@ class TwitCastingLiveIE(InfoExtractor):
'expected_exception': 'UserNotLive',
}]
_PROTECTED_LIVE_RE = r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)'
def _real_extract(self, url):
uploader_id = self._match_id(url)
self.to_screen(
'Downloading live video of user {0}. '
'Pass "https://twitcasting.tv/{0}/show" to download the history'.format(uploader_id))
is_live = traverse_obj(self._download_json(
f'https://frontendapi.twitcasting.tv/watch/user/{uploader_id}',
uploader_id, 'Checking live status', data=b'', fatal=False), ('is_live', {bool}))
if is_live is False: # only raise here if API response was as expected
raise UserNotLive(video_id=uploader_id)
# Use /show/ page so that password-protected and members-only livestreams can be found
webpage = self._download_webpage(url, uploader_id)
is_live = self._search_regex( # first pattern is for public live
(r'(data-is-onlive="true")', self._PROTECTED_LIVE_RE), webpage, 'is live?', default=None)
current_live = int_or_none(self._search_regex(
(r'data-type="movie" data-id="(\d+)">', # not available?
r'tw-sound-flag-open-link" data-id="(\d+)" style=', # not available?
r'data-movie-id="(\d+)"'), # if not currently live, value may be 0
webpage, 'current live ID', default=None))
if is_live and not current_live:
# fetch unfiltered /show to find running livestreams; we can't get ID of the password-protected livestream above
webpage = self._download_webpage(
f'https://twitcasting.tv/{uploader_id}/show/', uploader_id, 'Downloading live history')
is_live = is_live or self._search_regex(
r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)',
webpage, 'is live?', default=False)
# Current live is always the first match
f'https://twitcasting.tv/{uploader_id}/show/', uploader_id,
note='Downloading live history')
is_live = self._search_regex(self._PROTECTED_LIVE_RE, webpage, 'is live?', default=None)
if is_live:
# get the first live; running live is always at the first
current_live = self._search_regex(
r'(?s)<a\s+class="tw-movie-thumbnail2"\s+href="/[^/"]+/movie/(?P<video_id>\d+)"',
webpage, 'current live ID', default=None, group='video_id')
if not is_live or not current_live:
r'(?s)<a\s+class="tw-movie-thumbnail2"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>',
webpage, 'current live ID 2', default=None, group='video_id')
if not current_live:
raise UserNotLive(video_id=uploader_id)
return self.url_result(f'https://twitcasting.tv/{uploader_id}/movie/{current_live}', TwitCastingIE)

yt_dlp/extractor/vidly.py

@@ -1,83 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
mimetype2ext,
url_or_none,
)
from ..utils.traversal import traverse_obj
class VidlyIE(InfoExtractor):
_VALID_URL = r'https?://(?:vid\.ly/|(?:s\.)?vid\.ly/embeded\.html\?(?:[^#]+&)?link=)(?P<id>\w+)'
_EMBED_REGEX = [r'<script[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//vid\.ly/\w+/embed[^\'"]+)',
r'<iframe[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//(?:s\.)?vid\.ly/embeded\.html\?(?:[^#\'"]+&)?link=\w+[^\'"]+)']
_TESTS = [{
# JWPlayer 7, Embeds forbidden
'url': 'https://vid.ly/2i3o9j/embed',
'info_dict': {
'id': '2i3o9j',
'ext': 'mp4',
'title': '2i3o9j',
'thumbnail': r're:https://\w+\.cloudfront\.net/',
},
}, {
# JWPlayer 6
'url': 'http://s.vid.ly/embeded.html?link=jw_test&new=1&autoplay=true&controls=true',
'info_dict': {
'id': 'jw_test',
'ext': 'mp4',
'title': '2x8m8t',
'thumbnail': r're:https://\w+\.cloudfront\.net/',
},
}, {
# Vidlyplayer
'url': 'https://vid.ly/7x0e6l',
'info_dict': {
'id': '7x0e6l',
'ext': 'mp4',
'title': '7x0e6l',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.petfinder.com/dog/gus-57378930/tn/ooltewah/furever-furkids-rescue-tn592/',
'info_dict': {
'id': 'w8p5b0',
'ext': 'mp4',
'title': 'w8p5b0',
'thumbnail': r're:https://\w+\.cloudfront\.net/',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
embed_script = self._download_webpage(
f'https://vid.ly/{video_id}/embed', video_id, headers={'Referer': 'https://vid.ly/'})
player = self._search_json(r'initCallback\(', embed_script, 'player', video_id)
player_type = player.get('player') or ''
if player_type.startswith('jwplayer'):
return self._parse_jwplayer_data(player['config'], video_id)
elif not player_type.startswith('vidly'):
raise ExtractorError(f'Unknown player type {player_type!r}')
formats = []
ext = mimetype2ext(traverse_obj(player, ('config', 'type')))
for source, fid in [('source', 'sd'), ('source_hd', 'hd')]:
if traverse_obj(player, ('config', source, {url_or_none})):
formats.append({
'url': player['config'][source],
'format_id': f'http-{fid}',
'ext': ext,
})
# Has higher quality formats
formats.extend(self._extract_m3u8_formats(
f'https://d3fenhwk93s16g.cloudfront.net/{video_id}/hls.m3u8', video_id,
fatal=False, note='Requesting higher quality m3u8 formats',
errnote='No higher quality m3u8 formats found') or [])
return {
'id': video_id,
'title': video_id,
'formats': formats,
}

yt_dlp/extractor/vocaroo.py

@@ -57,7 +57,7 @@ class VocarooIE(InfoExtractor):
'title': '',
'url': url,
'ext': 'mp3',
'timestamp': float_or_none(resp.headers.get('x-bz-upload-timestamp'), scale=1000),
'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000),
'vcodec': 'none',
'http_headers': http_headers,
}