Compare commits

..

7 Commits

Author SHA1 Message Date
c-basalt
ace367aaf3
Update yt_dlp/extractor/bilibili.py
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2023-11-15 21:20:24 -05:00
c-basalt
f1323ae863 extract baseclass 2023-11-15 21:16:51 -05:00
c-basalt
53e732deba format query 2023-11-15 21:01:24 -05:00
c-basalt
08f91358ab inline metadata 2023-11-15 20:57:45 -05:00
c-basalt
54e0dbc355
Update yt_dlp/extractor/bilibili.py
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2023-11-15 20:24:01 -05:00
c-basalt
fd62b4991b
Apply suggestions from code review
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2023-11-15 20:19:22 -05:00
c-basalt
b224bfce9c
Apply suggestions from code review
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2023-11-15 20:15:51 -05:00

View File

@ -14,7 +14,6 @@ from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
GeoRestrictedError, GeoRestrictedError,
RegexNotFoundError,
InAdvancePagedList, InAdvancePagedList,
OnDemandPagedList, OnDemandPagedList,
bool_or_none, bool_or_none,
@ -118,10 +117,11 @@ class BilibiliBaseIE(InfoExtractor):
'https://api.bilibili.com/x/player/v2', video_id, 'https://api.bilibili.com/x/player/v2', video_id,
query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid}, query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid},
note=f'Extracting subtitle info {cid}'), ('data', 'subtitle')) note=f'Extracting subtitle info {cid}'), ('data', 'subtitle'))
if not traverse_obj(subtitle_info, 'subtitles') and traverse_obj(subtitle_info, 'allow_submit'): subs_list = traverse_obj(subtitle_info, ('subtitles', lambda _, v: v['subtitle_url'] and v['lan']))
if not subs_list and traverse_obj(subtitle_info, 'allow_submit'):
if not self._get_cookies('https://api.bilibili.com').get('SESSDATA'): # no login session cookie if not self._get_cookies('https://api.bilibili.com').get('SESSDATA'): # no login session cookie
self.report_warning(f'CC subtitles (if exist) are only visible when logged in. {self._login_hint()}') self.report_warning(f'CC subtitles (if any) are only visible when logged in. {self._login_hint()}')
for s in traverse_obj(subtitle_info, ('subtitles', ...)): for s in subs_list:
subtitles.setdefault(s['lan'], []).append({ subtitles.setdefault(s['lan'], []).append({
'ext': 'srt', 'ext': 'srt',
'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
@ -173,7 +173,8 @@ class BilibiliBaseIE(InfoExtractor):
lambda _, v: url_or_none(v['share_url']) and v['id'])): lambda _, v: url_or_none(v['share_url']) and v['id'])):
yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id'))) yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id')))
def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges={}): def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges=None):
cid_edges = cid_edges or {}
division_data = self._download_json( division_data = self._download_json(
'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id, 'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id,
query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id}, query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id},
@ -192,9 +193,9 @@ class BilibiliBaseIE(InfoExtractor):
'text': ('option', {str}), 'text': ('option', {str}),
}), }),
}))) })))
# use dict to combine edges that use the save video section (same cid) # use dict to combine edges that use the same video section (same cid)
cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id] cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id]
for choice in edges[edge_id].get('choices', []): for choice in traverse_obj(edges, ('edge_id', 'choices', ...)):
if choice['edge_id'] not in edges: if choice['edge_id'] not in edges:
edges[choice['edge_id']] = {'cid': choice['cid']} edges[choice['edge_id']] = {'cid': choice['cid']}
self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges) self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges)
@ -203,8 +204,8 @@ class BilibiliBaseIE(InfoExtractor):
def _get_interactive_entries(self, video_id, cid, metainfo): def _get_interactive_entries(self, video_id, cid, metainfo):
graph_version = traverse_obj( graph_version = traverse_obj(
self._download_json( self._download_json(
f'https://api.bilibili.com/x/player/wbi/v2?bvid={video_id}&cid={cid}', 'https://api.bilibili.com/x/player/wbi/v2', video_id,
video_id, note='Extracting graph version'), 'Extracting graph version', query={'bvid': video_id, 'cid': cid}),
('data', 'interaction', 'graph_version', {int_or_none})) ('data', 'interaction', 'graph_version', {int_or_none}))
cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1) cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1)
for cid, edges in cid_edges.items(): for cid, edges in cid_edges.items():
@ -477,21 +478,25 @@ class BiliBiliIE(BilibiliBaseIE):
if is_festival: if is_festival:
video_data = initial_state['videoInfo'] video_data = initial_state['videoInfo']
else: else:
try: play_info_obj = self._search_json(
play_info_obj = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id) r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False)
play_info = play_info_obj['data'] if not play_info_obj:
except KeyError:
if play_info_obj.get('code') == 87007:
toast = get_element_by_class('tips-toast', webpage) or ''
msg = clean_html(f'{get_element_by_class("belongs-to", toast) or ""}{get_element_by_class("level", toast) or ""}')
raise ExtractorError(f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True)
raise ExtractorError('Failed to extract play_info')
except RegexNotFoundError:
if traverse_obj(initial_state, ('error', 'trueCode')) == -403: if traverse_obj(initial_state, ('error', 'trueCode')) == -403:
self.raise_login_required() self.raise_login_required()
if traverse_obj(initial_state, ('error', 'trueCode')) == -404: if traverse_obj(initial_state, ('error', 'trueCode')) == -404:
self.report_warning('This video may be deleted or geo-restricted. You might want to try a VPN or a proxy server (with --proxy)', video_id) raise ExtractorError(
raise 'This video may be deleted or geo-restricted. '
'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
play_info = traverse_obj(play_info_obj, ('data', {dict}))
if not play_info:
if traverse_obj(play_info_obj, 'code') == 87007:
toast = get_element_by_class('tips-toast', webpage) or ''
msg = clean_html(
f'{get_element_by_class("belongs-to", toast) or ""}'
+ (get_element_by_class('level', toast) or ''))
raise ExtractorError(
f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True)
raise ExtractorError('Failed to extract play info')
video_data = initial_state['videoData'] video_data = initial_state['videoData']
video_id, title = video_data['bvid'], video_data.get('title') video_id, title = video_data['bvid'], video_data.get('title')
@ -741,14 +746,16 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
media_id = self._match_id(url) media_id = self._match_id(url)
webpage = self._download_webpage(url, media_id) webpage = self._download_webpage(url, media_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) initial_state = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)
ss_id = initial_state['mediaInfo']['season_id'] ss_id = initial_state['mediaInfo']['season_id']
metainfo = traverse_obj(initial_state, ('mediaInfo', {
return self.playlist_result(
self._get_episodes_from_season(ss_id, url), media_id,
**traverse_obj(initial_state, ('mediaInfo', {
'title': ('title', {str}), 'title': ('title', {str}),
'description': ('evaluate', {str}), 'description': ('evaluate', {str}),
})) })))
return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id, **metainfo)
class BiliBiliBangumiSeasonIE(BilibiliBaseIE): class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
@ -803,30 +810,10 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo) return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo)
class BilibiliCheeseIE(BilibiliBaseIE): class BilibiliCheeseBaseIE(BilibiliBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/cheese/play/ep229832',
'info_dict': {
'id': '229832',
'ext': 'mp4',
'title': '1 - 课程先导片',
'alt_title': '视频课·3分41秒',
'uploader': '马督工',
'uploader_id': '316568752',
'episode': '课程先导片',
'episode_id': '229832',
'episode_number': 1,
'duration': 221,
'timestamp': 1695549606,
'upload_date': '20230924',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'view_count': int,
}
}]
_HEADERS = {'Referer': 'https://www.bilibili.com/'} _HEADERS = {'Referer': 'https://www.bilibili.com/'}
def _extract_episode(self, season_info, ep_id, headers): def _extract_episode(self, season_info, ep_id):
episode_info = traverse_obj(season_info, ( episode_info = traverse_obj(season_info, (
'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False) 'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False)
aid, cid = episode_info['aid'], episode_info['cid'] aid, cid = episode_info['aid'], episode_info['cid']
@ -837,9 +824,9 @@ class BilibiliCheeseIE(BilibiliBaseIE):
self.raise_login_required('You need to purchase the course to download this episode') self.raise_login_required('You need to purchase the course to download this episode')
play_info = self._download_json( play_info = self._download_json(
'https://api.bilibili.com/pugv/player/web/playurl?fnval=16&fourk=1', ep_id, 'https://api.bilibili.com/pugv/player/web/playurl', ep_id,
query={'avid': aid, 'cid': cid, 'ep_id': ep_id}, query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1},
headers=headers, note='Downloading playinfo')['data'] headers=self._HEADERS, note='Downloading playinfo')['data']
return { return {
'id': str_or_none(ep_id), 'id': str_or_none(ep_id),
@ -861,21 +848,43 @@ class BilibiliCheeseIE(BilibiliBaseIE):
}), }),
'subtitles': self.extract_subtitles(ep_id, cid, aid=aid), 'subtitles': self.extract_subtitles(ep_id, cid, aid=aid),
'__post_extractor': self.extract_comments(aid), '__post_extractor': self.extract_comments(aid),
'http_headers': headers, 'http_headers': self._HEADERS,
} }
def _download_season_info(self, query_key, video_id, headers): def _download_season_info(self, query_key, video_id):
return self._download_json( return self._download_json(
f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id, f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id,
headers=headers, note='Downloading season info')['data'] headers=self._HEADERS, note='Downloading season info')['data']
class BilibiliCheeseIE(BilibiliCheeseBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/cheese/play/ep229832',
'info_dict': {
'id': '229832',
'ext': 'mp4',
'title': '1 - 课程先导片',
'alt_title': '视频课·3分41秒',
'uploader': '马督工',
'uploader_id': '316568752',
'episode': '课程先导片',
'episode_id': '229832',
'episode_number': 1,
'duration': 221,
'timestamp': 1695549606,
'upload_date': '20230924',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'view_count': int,
}
}]
def _real_extract(self, url): def _real_extract(self, url):
ep_id = self._match_id(url) ep_id = self._match_id(url)
return self._extract_episode( return self._extract_episode(self._download_season_info('ep_id', ep_id), ep_id)
self._download_season_info('ep_id', ep_id, self._HEADERS), ep_id, self._HEADERS)
class BilibiliCheeseSeasonIE(BilibiliCheeseIE): class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/cheese/play/ss5918', 'url': 'https://www.bilibili.com/cheese/play/ss5918',
@ -917,20 +926,21 @@ class BilibiliCheeseSeasonIE(BilibiliCheeseIE):
def _get_cheese_entries(self, season_info): def _get_cheese_entries(self, season_info):
for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')): for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')):
yield { yield {
**self._extract_episode(season_info, ep_id, self._HEADERS), **self._extract_episode(season_info, ep_id),
'extractor_key': BilibiliCheeseIE.ie_key(), 'extractor_key': BilibiliCheeseIE.ie_key(),
'extractor': BilibiliCheeseIE.IE_NAME, 'extractor': BilibiliCheeseIE.IE_NAME,
} }
def _real_extract(self, url): def _real_extract(self, url):
season_id = self._match_id(url) season_id = self._match_id(url)
season_info = self._download_season_info('season_id', season_id, self._HEADERS) season_info = self._download_season_info('season_id', season_id)
metainfo = traverse_obj(season_info, {
return self.playlist_result(
self._get_cheese_entries(season_info), season_id,
**traverse_obj(season_info, {
'title': ('title', {str}), 'title': ('title', {str}),
'description': ('subtitle', {str}), 'description': ('subtitle', {str}),
}) }))
return self.playlist_result(self._get_cheese_entries(season_info), season_id, **metainfo)
class BilibiliSpaceBaseIE(InfoExtractor): class BilibiliSpaceBaseIE(InfoExtractor):