Compare commits

25 Commits

Author SHA1 Message Date
N/Ame
d98146db04
Merge 0f6fb121ba into f2a4983df7 2024-11-15 07:34:16 +01:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
grqx_termux
0f6fb121ba Merge branch 'master' into biliSearchPageIE 2024-10-02 20:06:41 +13:00
grqx_wsl
69054b483f [BiliBiliSearchBaseIE] removed an unnecessary write_debug statement 2024-08-17 20:51:55 +12:00
grqx_wsl
2da3e052ed [BiliBiliSearchBaseIE, BiliBiliSearchAllIE, BiliBiliSearchPageIE] more compact:
- wrapped the search result extractor into a method of a base IE (BiliBiliSearchBaseIE)
- used `traverse_obj` for traversal of the search results instead of `for` loops (see the sketch after the commit list)

Resolves https://github.com/yt-dlp/yt-dlp/pull/10317#discussion_r1703943629
2024-08-17 18:48:50 +12:00
grqx_wsl
eac8a89b47 Merge remote-tracking branch 'upstream' into biliSearchPageIE 2024-08-17 11:20:08 +12:00
grqx_wsl
b6f26805a7 revert all changes to supportedsites.md as it's automatically generated by devscripts/make_supportedsites.py 2024-07-21 20:29:52 +12:00
grqx_wsl
50271dbd80 add IE_DESC for BiliBiliSearchPageIE 2024-07-21 12:27:07 +12:00
grqx_wsl
853a9224f5 apply patch from comment https://github.com/yt-dlp/yt-dlp/pull/10317#issuecomment-2202289727 2024-07-19 22:38:21 +12:00
grqx_wsl
4c16680c00 (Empty commit) ci run dl all 2024-07-03 22:25:53 +12:00
grqx_wsl
cbbf6ad2cd ci run dl 2024-07-02 19:53:58 +12:00
grqx_wsl
d079c1a67f WBI signature 2024-07-02 18:25:34 +12:00
N/Ame
a9ac7d7f99
Merge branch 'yt-dlp:master' into biliSearchPageIE 2024-07-02 17:31:58 +12:00
N/Ame
ba46a9e0d1
Merge branch 'yt-dlp:master' into biliSearchPageIE 2024-07-01 21:02:51 +12:00
grqx_wsl
ca780a228c Fix search key (prefix) in supportedsites.md 2024-07-01 16:32:10 +12:00
grqx_wsl
d6842fcd7f change search key from bilisearchall to biliallsearch to disambiguate 2024-07-01 10:33:22 +12:00
grqx_wsl
ddca238423 modified: yt_dlp/extractor/bilibili.py 2024-06-30 20:53:08 +12:00
grqx_wsl
72fac58401 Supported more params 2024-06-29 20:05:53 +12:00
grqx_wsl
41bb0c6b37 Add SearchInfoExtractor: BiliBiliSearchAllIE 2024-06-28 12:10:00 +12:00
grqx_wsl
fc79d7325c add test, add geo verification proxy support 2024-06-28 11:45:54 +12:00
grqx
31b0294bd6
Merge branch 'yt-dlp:master' into biliSearchPageIE 2024-06-28 10:42:01 +12:00
grqx
31be8d3dbd
Merge branch 'yt-dlp:master' into biliSearchPageIE 2024-06-28 00:52:00 +12:00
grqx_wsl
af0eb72a8b uses json api, pass hatch test 2024-06-28 00:32:37 +12:00
grqx
f40c4b4ee2 stop using codespaces
modified:   yt_dlp/extractor/_extractors.py
modified:   yt_dlp/extractor/bilibili.py
2024-06-24 11:24:32 +00:00
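
Before the file diffs, a minimal sketch of the `traverse_obj` refactor from commit 2da3e052ed above, which replaces nested `for` loops with a single branching path. The response shape here is illustrative, not Bilibili's real schema:

from yt_dlp.utils import traverse_obj

# Illustrative search response: result groups, each holding a 'data' list
result_list = [
    {'result_type': 'video', 'data': [{'type': 'video', 'arcurl': 'https://example.com/1'}]},
    {'result_type': 'bili_user', 'data': [{'type': 'bili_user', 'mid': 42}]},
]

# Nested loops, as before the refactor
flat = []
for group in result_list:
    for item in group.get('data') or []:
        flat.append(item)

# One traverse_obj call: `...` branches over all groups, 'data' selects each
# group's list, and the second `...` branches over its items -> a flat list
assert traverse_obj(result_list, (..., 'data', ...)) == flat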
4 changed files with 271 additions and 17 deletions

yt_dlp/extractor/_extractors.py

@@ -256,7 +256,9 @@ from .bilibili import (
     BiliBiliIE,
     BiliBiliPlayerIE,
     BilibiliPlaylistIE,
+    BiliBiliSearchAllIE,
     BiliBiliSearchIE,
+    BiliBiliSearchPageIE,
     BilibiliSeriesListIE,
     BilibiliSpaceAudioIE,
     BilibiliSpaceVideoIE,

yt_dlp/extractor/archive_org.py

@@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
                 },
             },
         ],
+    }, {
+        # The reviewbody is None for one of the reviews; just need to extract data without crashing
+        'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+        'info_dict': {
+            'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+            'ext': 'mp3',
+            'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'creators': ['Grateful Dead'],
+            'duration': 338.31,
+            'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
+            'display_id': 'gd95-04-02d1t04.shn',
+            'location': 'Pyramid Arena',
+            'uploader': 'jon@archive.org',
+            'album': '1995-04-02 - Pyramid Arena',
+            'upload_date': '20040519',
+            'track_number': 4,
+            'release_date': '19950402',
+            'timestamp': 1084927901,
+        },
     }]

     @staticmethod
@@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
             info['comments'].append({
                 'id': review.get('review_id'),
                 'author': review.get('reviewer'),
-                'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
+                'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
                 'timestamp': unified_timestamp(review.get('createdate')),
                 'parent': 'root'})
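
For context on the fix above: the old expression concatenated `review.get('reviewbody')` directly, which raised `TypeError` when `reviewbody` was `None` (the case reported in #11526). `join_nonempty` with `from_dict` looks the named keys up in the dict and drops empty values before joining. A quick sketch of the behavior, assuming yt-dlp's `join_nonempty` utility:

from yt_dlp.utils import join_nonempty

review = {'reviewtitle': 'Great show', 'reviewbody': None}

# Old: string concatenation crashes on the None body:
#   str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody')
#   -> TypeError: can only concatenate str (not "NoneType") to str
# New: None/empty values are simply skipped before joining
assert join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n') == 'Great show'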

yt_dlp/extractor/bilibili.py

@@ -1660,7 +1660,96 @@ class BilibiliCategoryIE(InfoExtractor):
         return self.playlist_result(self._entries(category, subcategory, query), query, query)

-class BiliBiliSearchIE(SearchInfoExtractor):
+class BiliBiliSearchBaseIE(BilibiliBaseIE):
+    def _extract_search_result(self, result_data):
+        live_room_prefix = 'https://live.bilibili.com/'
+        bili_user_prefix = 'https://space.bilibili.com/'
+        result_type = result_data.get('type')
+        if result_type == 'video':
+            return self.url_result(result_data['arcurl'])
+        elif result_type == 'live_room':
+            return self.url_result(live_room_prefix + str(result_data['roomid']))
+        elif result_type in ['media_ft', 'media_bangumi']:
+            return self.url_result(result_data['url'])
+        elif result_type == 'bili_user':
+            return self.url_result(bili_user_prefix + str(result_data['mid']))
+
+
+class BiliBiliSearchAllIE(SearchInfoExtractor, BiliBiliSearchBaseIE):
+    IE_DESC = 'Bilibili all search'
+    _MAX_RESULTS = 100000
+    _SEARCH_KEY = 'biliallsearch'
+    _TESTS = [{
+        'url': 'biliallsearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
+        'playlist_count': 3,
+        'info_dict': {
+            'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
+            'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'BV1n44y1Q7sc',
+                'ext': 'mp4',
+                'title': '“出道一年我怎么还在等你单推的女人睡觉后开播啊”【一分钟了解靡烟miya】',
+                'timestamp': 1669889987,
+                'upload_date': '20221201',
+                'description': 'md5:43343c0973defff527b5a4b403b4abf9',
+                'tags': list,
+                'uploader': '靡烟miya',
+                'duration': 123.156,
+                'uploader_id': '1958703906',
+                'comment_count': int,
+                'view_count': int,
+                'like_count': int,
+                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
+                '_old_archive_ids': ['bilibili 988222410_part1'],
+            },
+        }],
+    }, {
+        'url': 'biliallsearch:LOL',
+        'playlist_count': 1,
+        'info_dict': {
+            'id': 'LOL',
+            'title': 'LOL',
+        },
+    }]
+
+    def _search_results(self, query):
+        headers = self.geo_verification_headers()
+        headers['Referer'] = 'https://www.bilibili.com/'
+        page_size = 50
+        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
+            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
+        for page_num in itertools.count(1):
+            query_params = {
+                'keyword': query,
+                'page': page_num,
+                'dynamic_offset': (page_num - 1) * page_size,
+                'platform': 'pc',
+            }
+            api_url = r'https://api.bilibili.com/x/web-interface/wbi/search/all/v2'
+            try:
+                search_all_result = self._download_json(
+                    api_url, video_id=query, query=self._sign_wbi(query_params, query),
+                    headers=headers,
+                )
+            except ExtractorError as e:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 412:
+                    raise ExtractorError('Request is blocked by server (-412).', expected=True)
+                raise
+            status_code = search_all_result['code']
+            if status_code == -400:
+                raise ExtractorError('Invalid request (-400).', expected=True)
+            result_list = search_all_result['data'].get('result')
+            if not result_list:
+                self.write_debug(f'Response: {search_all_result}')
+                raise ExtractorError(f'Result not found in the response ({status_code}).',
+                                     expected=True)
+            for result_data in traverse_obj(result_list, (..., 'data', ...)):
+                yield self._extract_search_result(result_data)
+
+
+class BiliBiliSearchIE(SearchInfoExtractor, BilibiliBaseIE):
     IE_DESC = 'Bilibili video search'
     _MAX_RESULTS = 100000
     _SEARCH_KEY = 'bilisearch'
@@ -1695,21 +1784,16 @@ class BiliBiliSearchIE(SearchInfoExtractor):
     def _search_results(self, query):
         if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
             self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
+        headers = self.geo_verification_headers()
+        headers['Referer'] = 'https://www.bilibili.com/'
         for page_num in itertools.count(1):
             videos = self._download_json(
-                'https://api.bilibili.com/x/web-interface/search/type', query,
-                note=f'Extracting results from page {page_num}', query={
-                    'Search_key': query,
+                'https://api.bilibili.com/x/web-interface/wbi/search/type', query,
+                note=f'Extracting results from page {page_num}', query=self._sign_wbi({
                     'keyword': query,
                     'page': page_num,
-                    'context': '',
-                    'duration': 0,
-                    'tids_2': '',
-                    '__refresh__': 'true',
                     'search_type': 'video',
-                    'tids': 0,
-                    'highlight': 1,
-                })['data'].get('result')
+                }, query), headers=headers)['data'].get('result')
             if not videos:
                 break
             for video in videos:
@@ -2406,3 +2490,126 @@ class BiliLiveIE(InfoExtractor):
                 'Referer': url,
             },
         }
+
+
+class BiliBiliSearchPageIE(BiliBiliSearchBaseIE):
+    IE_DESC = 'Bilibili Search Page URL Extractor'
+    _VALID_URL = r'https?://search\.bilibili\.com/(?P<type>all|video|bangumi|pgc|live|upuser).*'
+    _TESTS = [{
+        'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8',
+        'playlist_count': 36,
+        'info_dict': {
+            'id': 'yt - dlp 下载器',
+            'title': 'yt - dlp 下载器',
+        },
+    }, {
+        'url': r'https://search.bilibili.com/bangumi/?keyword=%E5%AD%A4%E7%8B%AC%E6%91%87%E6%BB%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5',
+        'playlist_mincount': 1,
+        'info_dict': {
+            'id': '孤独摇滚',
+            'title': '孤独摇滚',
+        },
+        'skip': 'geo-restricted',
+    }, {
+        'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72',
+        'playlist_mincount': 4,
+        'info_dict': {
+            'id': '让子弹飞',
+            'title': '让子弹飞',
+        },
+    }]
+
+    def _real_extract(self, url):
+        headers = self.geo_verification_headers()
+        headers['Referer'] = url
+        entries = []
+        params = parse_qs(url)
+        query = {
+            'platform': 'pc',
+            'page_size': 36,
+        }
+        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
+            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
+        search_type = self._match_valid_url(url).group('type')
+        raw_playlist_id = traverse_obj(params, ('keyword', 0))
+        if not raw_playlist_id:
+            raise ExtractorError('Please specify the keyword to search for!', expected=True)
+        playlist_id = urllib.parse.unquote_plus(raw_playlist_id)
+        search_type_mapping = {
+            'video': 'video',
+            'bangumi': 'media_bangumi',
+            'pgc': 'media_ft',
+            'live': 'live_room',
+            'upuser': 'bili_user',
+            'all': 'video',  # 'all' search calls video search after page 1
+        }
+        valid_params = [
+            'keyword',
+            'page',
+            'order',
+            'duration',
+            'tids',
+            'search_type',  # Only when searching for live_room or live_user
+            'order_sort',
+            'user_type',
+        ]
+        for valid_param in valid_params:
+            param_value = traverse_obj(params, (valid_param, 0))
+            if param_value is not None:
+                query[valid_param] = param_value
+        page_num = int(query.get('page', 1))
+        param_offset = int_or_none(traverse_obj(params, ('o', 0)))
+        if page_num == 1:
+            query['dynamic_offset'] = 0
+        elif param_offset is not None:
+            query['dynamic_offset'] = param_offset
+        else:
+            query['dynamic_offset'] = query['page_size'] * (page_num - 1)
+        if search_type == 'live' and traverse_obj(params, ('search_type', 0)) == 'live_user':
+            raise ExtractorError('Live users are not downloadable!', expected=True)
+        if search_type == 'all' and page_num == 1:
+            try:
+                search_all_result = self._download_json(
+                    r'https://api.bilibili.com/x/web-interface/wbi/search/all/v2',
+                    video_id=playlist_id, query=self._sign_wbi(query, playlist_id), headers=headers)
+            except ExtractorError as e:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 412:
+                    raise ExtractorError('Request is blocked by server (-412).', expected=True)
+                raise
+            status_code = search_all_result['code']
+            if status_code == -400:
+                raise ExtractorError('Invalid request (-400).', expected=True)
+            result_list = search_all_result['data'].get('result')
+            if not result_list:
+                self.write_debug(f'Response: {search_all_result}')
+                raise ExtractorError(f'Result not found in the response ({status_code}).',
+                                     expected=True)
+            entries = [self._extract_search_result(result_data)
+                       for result_data in traverse_obj(result_list, (..., 'data', ...))]
+        else:
+            query = {
+                'search_type': search_type_mapping[search_type],
+                **query,  # an explicit search_type URL param overrides the mapped type
+            }
+            try:
+                search_type_result = self._download_json(
+                    r'https://api.bilibili.com/x/web-interface/wbi/search/type',
+                    video_id=playlist_id, query=self._sign_wbi(query, playlist_id), headers=headers,
+                )
+            except ExtractorError as e:
+                if isinstance(e.cause, HTTPError) and e.cause.status == 412:
+                    raise ExtractorError('Request is blocked by server (-412).')
+                raise
+            status_code = search_type_result['code']
+            if status_code == -400:
+                raise ExtractorError('Invalid request (-400).')
+            result_list = search_type_result['data'].get('result')
+            if not result_list:
+                self.write_debug(f'Response: {search_type_result}')
+                raise ExtractorError(
+                    f'Result not found in the response ({status_code}). '
+                    'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
+            entries = [self._extract_search_result(result_data) for result_data in result_list]
+        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id)
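
All of the search requests above route their query strings through `self._sign_wbi(...)`, matching the 'WBI signature' commit and the endpoint change from `/x/web-interface/search/type` to `/x/web-interface/wbi/search/type`. The sketch below is a standalone approximation of Bilibili's WBI signing as documented by community reverse engineering (the bilibili-API-collect project); the reorder table and the `wts`/`w_rid` parameter names come from that documentation, not from this diff, and real requests must first fetch `img_key`/`sub_key` from the `/x/web-interface/nav` endpoint:

import hashlib
import time
import urllib.parse

# Fixed reorder table, per community documentation of the WBI scheme
MIXIN_KEY_ENC_TAB = [
    46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
    33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
    61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
    36, 20, 34, 44, 52,
]

def sign_wbi(params, img_key, sub_key):
    """Return params extended with 'wts' and the 'w_rid' MD5 signature."""
    # 1. Derive a 32-char mixin key by reordering img_key + sub_key
    mixin_key = ''.join((img_key + sub_key)[i] for i in MIXIN_KEY_ENC_TAB)[:32]
    # 2. Add a unix timestamp, then serialize the params sorted by key
    signed = {**params, 'wts': int(time.time())}
    query = urllib.parse.urlencode(sorted(signed.items()))
    # 3. w_rid is the MD5 hex digest of the query string + mixin key
    signed['w_rid'] = hashlib.md5((query + mixin_key).encode()).hexdigest()
    return signed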

yt_dlp/extractor/facebook.py

@@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
             return extract_video_data(try_get(
                 js_data, lambda x: x['jsmods']['instances'], list) or [])

-        def extract_dash_manifest(video, formats):
+        def extract_dash_manifest(vid_data, formats, mpd_url=None):
             dash_manifest = traverse_obj(
-                video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
+                vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
             if dash_manifest:
                 formats.extend(self._parse_mpd_formats(
                     compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
-                    mpd_url=url_or_none(video.get('dash_manifest_url'))))
+                    mpd_url=url_or_none(vid_data.get('dash_manifest_url')) or mpd_url))
@@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
                 video = video['creation_story']
                 video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
                 video.update(reel_info)
-            fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
+
             formats = []
             q = qualities(['sd', 'hd'])
+
+            # Legacy formats extraction
+            fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
             for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
                                    ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
                                    ('browser_native_sd_url', 'sd')):
@@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
                 if not playable_url:
                     continue
                 if determine_ext(playable_url) == 'mpd':
-                    formats.extend(self._extract_mpd_formats(playable_url, video_id))
+                    formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
                 else:
                     formats.append({
                         'format_id': format_id,
@@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
                         'url': playable_url,
                     })
             extract_dash_manifest(fmt_data, formats)
+
+            # New videoDeliveryResponse formats extraction
+            fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
+            mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
+            dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
+            for idx, dash_manifest in enumerate(dash_manifests):
+                extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
+            if not dash_manifests:
+                # Only extract from MPD URLs if the manifests are not already provided
+                for mpd_url in mpd_urls:
+                    formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
+            for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
+                format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
+                formats.append({
+                    'format_id': format_id,
+                    # sd, hd formats w/o resolution info should be deprioritized below DASH
+                    'quality': q(format_id) - 3,
+                    'url': prog_fmt['progressive_url'],
+                })
+            for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
+                formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
+
             if not formats:
                 # Do not append false positive entry w/o any formats
                 return
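
A note on ordering in the new videoDeliveryResponse block above: the inline `manifest_xml` documents are parsed first, and the `dash_manifest_urls` are fetched only when no inline manifests exist, since both describe the same streams and processing both would yield duplicate formats. A minimal standalone sketch of that prefer-inline-then-fallback pattern (the function names here are illustrative stand-ins, not the extractor's API):

def collect_dash_formats(fmt_data, parse_inline, fetch_mpd):
    """parse_inline(xml, mpd_url) and fetch_mpd(mpd_url) stand in for
    self._parse_mpd_formats / self._extract_mpd_formats in the real extractor."""
    formats = []
    mpd_urls = [d['manifest_url'] for d in fmt_data.get('dash_manifest_urls') or []]
    inline = [d for d in fmt_data.get('dash_manifests') or [] if d.get('manifest_xml')]
    # Prefer manifests embedded in the response, pairing each with its URL
    # (when present) so relative segment references still resolve
    for idx, manifest in enumerate(inline):
        mpd_url = mpd_urls[idx] if idx < len(mpd_urls) else None
        formats.extend(parse_inline(manifest['manifest_xml'], mpd_url))
    # Fetch the MPD URLs only as a fallback, to avoid duplicating formats
    if not inline:
        for mpd_url in mpd_urls:
            formats.extend(fetch_mpd(mpd_url))
    return formats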