Update yt_dlp/extractor/bilibili.py

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
extract baseclass
2024-11-07 16:01:27 +01:00 · 2023-11-15 21:20:24 -05:00 · 2023-11-15 21:16:51 -05:00 · 2023-11-15 21:01:24 -05:00 · 2023-11-15 20:57:45 -05:00 · 2023-11-15 20:24:01 -05:00
1 changed file with 75 additions and 65 deletions
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -14,7 +14,6 @@ from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
    GeoRestrictedError,
-    RegexNotFoundError,
    InAdvancePagedList,
    OnDemandPagedList,
    bool_or_none,
@@ -118,10 +117,11 @@ class BilibiliBaseIE(InfoExtractor):
            'https://api.bilibili.com/x/player/v2', video_id,
            query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid},
            note=f'Extracting subtitle info {cid}'), ('data', 'subtitle'))
-        if not traverse_obj(subtitle_info, 'subtitles') and traverse_obj(subtitle_info, 'allow_submit'):
+        subs_list = traverse_obj(subtitle_info, ('subtitles', lambda _, v: v['subtitle_url'] and v['lan']))
+        if not subs_list and traverse_obj(subtitle_info, 'allow_submit'):
            if not self._get_cookies('https://api.bilibili.com').get('SESSDATA'):  # no login session cookie
-                self.report_warning(f'CC subtitles (if exist) are only visible when logged in. {self._login_hint()}')
-        for s in traverse_obj(subtitle_info, ('subtitles', ...)):
+                self.report_warning(f'CC subtitles (if any) are only visible when logged in. {self._login_hint()}')
+        for s in subs_list:
            subtitles.setdefault(s['lan'], []).append({
                'ext': 'srt',
                'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
@@ -173,7 +173,8 @@ class BilibiliBaseIE(InfoExtractor):
                lambda _, v: url_or_none(v['share_url']) and v['id'])):
            yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id')))

-    def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges={}):
+    def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges=None):
+        cid_edges = cid_edges or {}
        division_data = self._download_json(
            'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id,
            query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id},
@@ -192,9 +193,9 @@ class BilibiliBaseIE(InfoExtractor):
                'text': ('option', {str}),
            }),
        })))
-        # use dict to combine edges that use the save video section (same cid)
+        # use dict to combine edges that use the same video section (same cid)
        cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id]
-        for choice in edges[edge_id].get('choices', []):
+        for choice in traverse_obj(edges, (edge_id, 'choices', ...)):
            if choice['edge_id'] not in edges:
                edges[choice['edge_id']] = {'cid': choice['cid']}
                self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges)
@@ -203,8 +204,8 @@ class BilibiliBaseIE(InfoExtractor):
    def _get_interactive_entries(self, video_id, cid, metainfo):
        graph_version = traverse_obj(
            self._download_json(
-                f'https://api.bilibili.com/x/player/wbi/v2?bvid={video_id}&cid={cid}',
-                video_id, note='Extracting graph version'),
+                'https://api.bilibili.com/x/player/wbi/v2', video_id,
+                'Extracting graph version', query={'bvid': video_id, 'cid': cid}),
            ('data', 'interaction', 'graph_version', {int_or_none}))
        cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1)
        for cid, edges in cid_edges.items():
@@ -477,21 +478,25 @@ class BiliBiliIE(BilibiliBaseIE):
        if is_festival:
            video_data = initial_state['videoInfo']
        else:
-            try:
-                play_info_obj = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)
-                play_info = play_info_obj['data']
-            except KeyError:
-                if play_info_obj.get('code') == 87007:
-                    toast = get_element_by_class('tips-toast', webpage) or ''
-                    msg = clean_html(f'{get_element_by_class("belongs-to", toast) or ""}，{get_element_by_class("level", toast) or ""}')
-                    raise ExtractorError(f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True)
-                raise ExtractorError('Failed to extract play_info')
-            except RegexNotFoundError:
+            play_info_obj = self._search_json(
+                r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False)
+            if not play_info_obj:
                if traverse_obj(initial_state, ('error', 'trueCode')) == -403:
                    self.raise_login_required()
                if traverse_obj(initial_state, ('error', 'trueCode')) == -404:
-                    self.report_warning('This video may be deleted or geo-restricted. You might want to try a VPN or a proxy server (with --proxy)', video_id)
-                raise
+                    raise ExtractorError(
+                        'This video may be deleted or geo-restricted. '
+                        'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
+            play_info = traverse_obj(play_info_obj, ('data', {dict}))
+            if not play_info:
+                if traverse_obj(play_info_obj, 'code') == 87007:
+                    toast = get_element_by_class('tips-toast', webpage) or ''
+                    msg = clean_html(
+                        f'{get_element_by_class("belongs-to", toast) or ""}，'
+                        + (get_element_by_class('level', toast) or ''))
+                    raise ExtractorError(
+                        f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True)
+                raise ExtractorError('Failed to extract play info')
            video_data = initial_state['videoData']

        video_id, title = video_data['bvid'], video_data.get('title')
@@ -741,14 +746,16 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
        media_id = self._match_id(url)
        webpage = self._download_webpage(url, media_id)

-        initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)
+        initial_state = self._search_json(
+            r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)
        ss_id = initial_state['mediaInfo']['season_id']
-        metainfo = traverse_obj(initial_state, ('mediaInfo', {
-            'title': ('title', {str}),
-            'description': ('evaluate', {str}),
-        }))

-        return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id, **metainfo)
+        return self.playlist_result(
+            self._get_episodes_from_season(ss_id, url), media_id,
+            **traverse_obj(initial_state, ('mediaInfo', {
+                'title': ('title', {str}),
+                'description': ('evaluate', {str}),
+            })))


 class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
@@ -803,30 +810,10 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
        return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo)


-class BilibiliCheeseIE(BilibiliBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://www.bilibili.com/cheese/play/ep229832',
-        'info_dict': {
-            'id': '229832',
-            'ext': 'mp4',
-            'title': '1 - 课程先导片',
-            'alt_title': '视频课 · 3分41秒',
-            'uploader': '马督工',
-            'uploader_id': '316568752',
-            'episode': '课程先导片',
-            'episode_id': '229832',
-            'episode_number': 1,
-            'duration': 221,
-            'timestamp': 1695549606,
-            'upload_date': '20230924',
-            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
-            'view_count': int,
-        }
-    }]
+class BilibiliCheeseBaseIE(BilibiliBaseIE):
    _HEADERS = {'Referer': 'https://www.bilibili.com/'}

-    def _extract_episode(self, season_info, ep_id, headers):
+    def _extract_episode(self, season_info, ep_id):
        episode_info = traverse_obj(season_info, (
            'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False)
        aid, cid = episode_info['aid'], episode_info['cid']
@@ -837,9 +824,9 @@ class BilibiliCheeseIE(BilibiliBaseIE):
            self.raise_login_required('You need to purchase the course to download this episode')

        play_info = self._download_json(
-            'https://api.bilibili.com/pugv/player/web/playurl?fnval=16&fourk=1', ep_id,
-            query={'avid': aid, 'cid': cid, 'ep_id': ep_id},
-            headers=headers, note='Downloading playinfo')['data']
+            'https://api.bilibili.com/pugv/player/web/playurl', ep_id,
+            query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1},
+            headers=self._HEADERS, note='Downloading playinfo')['data']

        return {
            'id': str_or_none(ep_id),
@@ -861,21 +848,43 @@ class BilibiliCheeseIE(BilibiliBaseIE):
            }),
            'subtitles': self.extract_subtitles(ep_id, cid, aid=aid),
            '__post_extractor': self.extract_comments(aid),
-            'http_headers': headers,
+            'http_headers': self._HEADERS,
        }

-    def _download_season_info(self, query_key, video_id, headers):
+    def _download_season_info(self, query_key, video_id):
        return self._download_json(
            f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id,
-            headers=headers, note='Downloading season info')['data']
+            headers=self._HEADERS, note='Downloading season info')['data']
+
+
+class BilibiliCheeseIE(BilibiliCheeseBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.bilibili.com/cheese/play/ep229832',
+        'info_dict': {
+            'id': '229832',
+            'ext': 'mp4',
+            'title': '1 - 课程先导片',
+            'alt_title': '视频课 · 3分41秒',
+            'uploader': '马督工',
+            'uploader_id': '316568752',
+            'episode': '课程先导片',
+            'episode_id': '229832',
+            'episode_number': 1,
+            'duration': 221,
+            'timestamp': 1695549606,
+            'upload_date': '20230924',
+            'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+            'view_count': int,
+        }
+    }]

    def _real_extract(self, url):
        ep_id = self._match_id(url)
-        return self._extract_episode(
-            self._download_season_info('ep_id', ep_id, self._HEADERS), ep_id, self._HEADERS)
+        return self._extract_episode(self._download_season_info('ep_id', ep_id), ep_id)


-class BilibiliCheeseSeasonIE(BilibiliCheeseIE):
+class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE):
    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.bilibili.com/cheese/play/ss5918',
@@ -917,20 +926,21 @@ class BilibiliCheeseSeasonIE(BilibiliCheeseIE):
    def _get_cheese_entries(self, season_info):
        for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')):
            yield {
-                **self._extract_episode(season_info, ep_id, self._HEADERS),
+                **self._extract_episode(season_info, ep_id),
                'extractor_key': BilibiliCheeseIE.ie_key(),
                'extractor': BilibiliCheeseIE.IE_NAME,
            }

    def _real_extract(self, url):
        season_id = self._match_id(url)
-        season_info = self._download_season_info('season_id', season_id, self._HEADERS)
-        metainfo = traverse_obj(season_info, {
-            'title': ('title', {str}),
-            'description': ('subtitle', {str}),
-        })
+        season_info = self._download_season_info('season_id', season_id)

-        return self.playlist_result(self._get_cheese_entries(season_info), season_id, **metainfo)
+        return self.playlist_result(
+            self._get_cheese_entries(season_info), season_id,
+            **traverse_obj(season_info, {
+                'title': ('title', {str}),
+                'description': ('subtitle', {str}),
+            }))


 class BilibiliSpaceBaseIE(InfoExtractor):
Author	SHA1	Message	Date
c-basalt	ace367aaf3	Update yt_dlp/extractor/bilibili.py Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2023-11-15 21:20:24 -05:00
c-basalt	f1323ae863	extract baseclass	2023-11-15 21:16:51 -05:00
c-basalt	53e732deba	format query	2023-11-15 21:01:24 -05:00
c-basalt	08f91358ab	inline metadata	2023-11-15 20:57:45 -05:00
c-basalt	54e0dbc355	Update yt_dlp/extractor/bilibili.py Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2023-11-15 20:24:01 -05:00
c-basalt	fd62b4991b	Apply suggestions from code review Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2023-11-15 20:19:22 -05:00
c-basalt	b224bfce9c	Apply suggestions from code review Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>	2023-11-15 20:15:51 -05:00