Merge 0f6fb121ba into 1d253b0a27

Merge branch 'master' into biliSearchPageIE
[BiliBiliSearchBaseIE] removed an unnecessary write_debug sentence
2024-11-25 16:51:26 +01:00 · 2024-11-17 02:32:18 +05:30 · 2024-10-02 20:06:41 +13:00 · 2024-08-17 20:51:55 +12:00 · 2024-08-17 18:48:50 +12:00 · 2024-08-17 11:20:08 +12:00
2 changed files with 220 additions and 11 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -260,7 +260,9 @@ from .bilibili import (
    BiliBiliIE,
    BiliBiliPlayerIE,
    BilibiliPlaylistIE,
    BiliBiliSearchAllIE,
    BiliBiliSearchIE,
    BiliBiliSearchPageIE,
    BilibiliSeriesListIE,
    BilibiliSpaceAudioIE,
    BilibiliSpaceVideoIE,
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@ -1660,7 +1660,96 @@ class BilibiliCategoryIE(InfoExtractor):
        return self.playlist_result(self._entries(category, subcategory, query), query, query)
-class BiliBiliSearchIE(SearchInfoExtractor):
+class BiliBiliSearchBaseIE(BilibiliBaseIE):
    def _extract_search_result(self, result_data):
        live_room_prefix = 'https://live.bilibili.com/'
        bili_user_prefix = 'https://space.bilibili.com/'
        result_type = result_data.get('type')
        if result_type == 'video':
            return self.url_result(result_data['arcurl'])
        elif result_type == 'live_room':
            return self.url_result(live_room_prefix + str(result_data['roomid']))
        elif result_type in ['media_ft', 'media_bangumi']:
            return self.url_result(result_data['url'])
        elif result_type == 'bili_user':
            return self.url_result(bili_user_prefix + str(result_data['mid']))
 class BiliBiliSearchAllIE(SearchInfoExtractor, BiliBiliSearchBaseIE):
    # Human-readable description of this extractor
    IE_DESC = 'Bilibili all search'
    # NOTE(review): presumably the upper bound on the number of results that
    # may be requested - confirm against SearchInfoExtractor
    _MAX_RESULTS = 100000
    # Search prefix; the tests below use it as `biliallsearch3:<query>` and
    # `biliallsearch:<query>`
    _SEARCH_KEY = 'biliallsearch'
    _TESTS = [{
        # Prefix carries the number 3 and exactly three entries are expected;
        # the first entry is checked in detail
        'url': 'biliallsearch3:靡烟 出道一年，我怎么还在等你单推的女人睡觉后开播啊',
        'playlist_count': 3,
        'info_dict': {
            'id': '靡烟 出道一年，我怎么还在等你单推的女人睡觉后开播啊',
            'title': '靡烟 出道一年，我怎么还在等你单推的女人睡觉后开播啊',
        },
        'playlist': [{
            'info_dict': {
                'id': 'BV1n44y1Q7sc',
                'ext': 'mp4',
                'title': '“出道一年，我怎么还在等你单推的女人睡觉后开播啊？”【一分钟了解靡烟miya】',
                'timestamp': 1669889987,
                'upload_date': '20221201',
                'description': 'md5:43343c0973defff527b5a4b403b4abf9',
                'tags': list,
                'uploader': '靡烟miya',
                'duration': 123.156,
                'uploader_id': '1958703906',
                'comment_count': int,
                'view_count': int,
                'like_count': int,
                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
                '_old_archive_ids': ['bilibili 988222410_part1'],
            },
        }],
    }, {
        # No number in the prefix; a single entry is expected
        'url': 'biliallsearch:LOL',
        'playlist_count': 1,
        'info_dict': {
            'id': 'LOL',
            'title': 'LOL',
        },
    }]
    def _search_results(self, query):
        """Yield search entries for *query* from the "search all" endpoint.

        Pages are requested one after another, starting at page 1, and every
        item found under ``data.result[*].data[*]`` of the response is passed
        through ``_extract_search_result`` and yielded.

        Iteration ends as soon as a page carries no items (or, after the
        first page, no ``result`` at all). Without that exit the page loop
        would keep issuing requests forever once the results are exhausted.

        Raises:
            ExtractorError: (expected) when the server answers HTTP 412,
                when the API reports code -400, or when the first page
                contains no ``result``.
        """
        headers = self.geo_verification_headers()
        headers['Referer'] = 'https://www.bilibili.com/'
        page_size = 50
        api_url = 'https://api.bilibili.com/x/web-interface/wbi/search/all/v2'

        # Set a random ``buvid3`` cookie when the cookie jar has none
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')

        for page_num in itertools.count(1):
            query_params = {
                'keyword': query,
                'page': page_num,
                'dynamic_offset': (page_num - 1) * page_size,
                'platform': 'pc',
            }
            try:
                search_all_result = self._download_json(
                    api_url, video_id=query, query=self._sign_wbi(query_params, query),
                    headers=headers,
                )
            except ExtractorError as e:
                if isinstance(e.cause, HTTPError) and e.cause.status == 412:
                    raise ExtractorError('Request is blocked by server (-412).', expected=True)
                raise

            status_code = search_all_result['code']
            if status_code == -400:
                raise ExtractorError('Invalid request (-400).', expected=True)

            # `data` may be missing or null; traverse_obj avoids crashing
            # before the intended error below can be reported
            result_list = traverse_obj(search_all_result, ('data', 'result'))
            if not result_list:
                if page_num > 1:
                    # Earlier pages delivered results, so this is simply the end
                    return
                self.write_debug(f'Response: {search_all_result}')
                raise ExtractorError(f'Result not found in the response ({status_code}).',
                                     expected=True)

            page_had_items = False
            for result_data in traverse_obj(result_list, (..., 'data', ...)):
                page_had_items = True
                yield self._extract_search_result(result_data)

            if not page_had_items:
                # An empty page means there is nothing left to fetch
                return
 class BiliBiliSearchIE(SearchInfoExtractor, BilibiliBaseIE):
    IE_DESC = 'Bilibili video search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'bilisearch'
@ -1695,21 +1784,16 @@ class BiliBiliSearchIE(SearchInfoExtractor):
    def _search_results(self, query):
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')
        headers = self.geo_verification_headers()
        headers['Referer'] = 'https://www.bilibili.com/'
        for page_num in itertools.count(1):
            videos = self._download_json(
-                'https://api.bilibili.com/x/web-interface/search/type', query,
+                'https://api.bilibili.com/x/web-interface/wbi/search/type', query,
-                note=f'Extracting results from page {page_num}', query={
+                note=f'Extracting results from page {page_num}', query=self._sign_wbi({
                    'Search_key': query,
                    'keyword': query,
                    'page': page_num,
                    'context': '',
                    'duration': 0,
                    'tids_2': '',
                    '__refresh__': 'true',
                    'search_type': 'video',
-                    'tids': 0,
+                }, query), headers=headers)['data'].get('result')
                    'highlight': 1,
                })['data'].get('result')
            if not videos:
                break
            for video in videos:
@ -2406,3 +2490,126 @@ class BiliLiveIE(InfoExtractor):
                'Referer': url,
            },
        }
 class BiliBiliSearchPageIE(BiliBiliSearchBaseIE):
    # Human-readable description of this extractor
    IE_DESC = 'Bilibili Search Page URL Extractor'
    # Matches search.bilibili.com URLs; the first path segment is captured as
    # the `type` group (all, video, bangumi, pgc, live or upuser)
    _VALID_URL = r'https?://search\.bilibili\.com/(?P<type>all|video|bangumi|pgc|live|upuser).*'
    _TESTS = [{
        # `all` search with only a keyword; exactly 36 entries are expected
        'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8',
        'playlist_count': 36,
        'info_dict': {
            'id': 'yt - dlp 下载器',
            'title': 'yt - dlp 下载器',
        },
    }, {
        # `bangumi` search; skipped because it is geo-restricted
        'url': r'https://search.bilibili.com/bangumi/?keyword=%E5%AD%A4%E7%8B%AC%E6%91%87%E6%BB%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5',
        'playlist_mincount': 1,
        'info_dict': {
            'id': '孤独摇滚',
            'title': '孤独摇滚',
        },
        'skip': 'geo-restricted',
    }, {
        # `video` search with order, duration, tids, page and offset (`o`) parameters
        'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72',
        'playlist_mincount': 4,
        'info_dict': {
            'id': '让子弹飞',
            'title': '让子弹飞',
        },
    }]
    def _real_extract(self, url):
        """Extract the results shown on a search.bilibili.com page as a playlist.

        The search type comes from the URL path and the search parameters
        from the URL query string. An ``all`` search on page 1 uses the
        "search all" endpoint; everything else uses the typed search
        endpoint. The decoded keyword serves as playlist id and title.

        Raises:
            ExtractorError: (expected) when no keyword is given, when a
                live-user search is requested, or when the API request is
                blocked, invalid or yields no result.
        """
        headers = self.geo_verification_headers()
        headers['Referer'] = url
        params = parse_qs(url)

        # Set a random ``buvid3`` cookie when the cookie jar has none
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')

        search_type = self._match_valid_url(url).group('type')

        # parse_qs() has already decoded percent-escapes and `+`; decoding a
        # second time would corrupt keywords such as `C++` or `100%25`
        playlist_id = traverse_obj(params, ('keyword', 0))
        if not playlist_id:
            raise ExtractorError('Please specify the keyword to search for!', expected=True)
        if search_type == 'live' and traverse_obj(params, ('search_type', 0)) == 'live_user':
            raise ExtractorError('Live users are not downloadable!', expected=True)

        search_type_mapping = {
            'video': 'video',
            'bangumi': 'media_bangumi',
            'pgc': 'media_ft',
            'live': 'live_room',
            'upuser': 'bili_user',
            'all': 'video',  # 'all' search calls video search after page 1
        }
        valid_params = (
            'keyword',
            'page',
            'order',
            'duration',
            'tids',
            'search_type',  # Only when searching for live_room or live_user
            'order_sort',
            'user_type',
        )

        query = {
            'platform': 'pc',
            'page_size': 36,
        }
        # Forward only the known URL parameters to the API
        for valid_param in valid_params:
            param_value = traverse_obj(params, (valid_param, 0))
            if param_value is not None:
                query[valid_param] = param_value

        # A non-numeric `page` falls back to the first page instead of crashing
        page_num = int_or_none(query.get('page'), default=1)
        param_offset = int_or_none(traverse_obj(params, ('o', 0)))
        if page_num == 1:
            query['dynamic_offset'] = 0
        elif param_offset is not None:
            query['dynamic_offset'] = param_offset
        else:
            query['dynamic_offset'] = query['page_size'] * (page_num - 1)

        if search_type == 'all' and page_num == 1:
            result_list = self._call_search_api(
                'https://api.bilibili.com/x/web-interface/wbi/search/all/v2',
                query, playlist_id, headers)
            # Results are grouped per category; flatten the `data` list of each group
            result_items = traverse_obj(result_list, (..., 'data', ...))
        else:
            query = {
                'search_type': search_type_mapping[search_type],
                **query,  # search_type in type is overridden when specified in url params
            }
            result_items = self._call_search_api(
                'https://api.bilibili.com/x/web-interface/wbi/search/type',
                query, playlist_id, headers,
                hint=' You might want to try a VPN or a proxy server (with --proxy)')

        entries = [self._extract_search_result(result_data) for result_data in result_items]
        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id)

    def _call_search_api(self, api_url, query, playlist_id, headers, hint=''):
        """Request a WBI-signed search endpoint and return its ``data.result``.

        *hint* is appended to the error message used when no result is found.

        Raises:
            ExtractorError: (expected) on HTTP 412, API code -400, or when
                the response carries no result.
        """
        try:
            response = self._download_json(
                api_url, video_id=playlist_id,
                query=self._sign_wbi(query, playlist_id), headers=headers)
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 412:
                raise ExtractorError('Request is blocked by server (-412).', expected=True)
            raise

        status_code = response['code']
        if status_code == -400:
            raise ExtractorError('Invalid request (-400).', expected=True)

        # `data` may be missing or null; traverse_obj avoids crashing before
        # the error below can be reported
        result_list = traverse_obj(response, ('data', 'result'))
        if not result_list:
            self.write_debug(f'Response: {response}')
            raise ExtractorError(
                f'Result not found in the response ({status_code}).{hint}', expected=True)
        return result_list
Author	SHA1	Message	Date
N/Ame	be6d011c43	Merge `0f6fb121ba` into `1d253b0a27`	2024-11-17 02:32:18 +05:30
grqx_termux	0f6fb121ba	Merge branch 'master' into biliSearchPageIE	2024-10-02 20:06:41 +13:00
grqx_wsl	69054b483f	[BiliBiliSearchBaseIE] removed an unnecessary write_debug sentence	2024-08-17 20:51:55 +12:00
grqx_wsl	2da3e052ed	[BiliBiliSearchBaseIE, BiliBiliSearchAllIE, BiliBiliSearchPageIE] more compact - wrapped search result extractor into a method of a baseie(BiliBiliSearchBaseIE) - used `traverse_obj` for traversal through search result instead of `for` loops Resolves https://github.com/yt-dlp/yt-dlp/pull/10317#discussion_r1703943629	2024-08-17 18:48:50 +12:00
grqx_wsl	eac8a89b47	Merge remote-tracking branch 'upstream' into biliSearchPageIE	2024-08-17 11:20:08 +12:00
grqx_wsl	b6f26805a7	revert all changes to supportedsites.md as it's automatically generated by `devscripts/make_supportedsites.py`	2024-07-21 20:29:52 +12:00
grqx_wsl	50271dbd80	add IE_DESC for BiliBiliSearchPageIE	2024-07-21 12:27:07 +12:00
grqx_wsl	853a9224f5	apply patch from comment https://github.com/yt-dlp/yt-dlp/pull/10317#issuecomment-2202289727	2024-07-19 22:38:21 +12:00
grqx_wsl	4c16680c00	(Empty commit) ci run dl all	2024-07-03 22:25:53 +12:00
grqx_wsl	cbbf6ad2cd	ci run dl	2024-07-02 19:53:58 +12:00
grqx_wsl	d079c1a67f	WBI signature	2024-07-02 18:25:34 +12:00
N/Ame	a9ac7d7f99	Merge branch 'yt-dlp:master' into biliSearchPageIE	2024-07-02 17:31:58 +12:00
N/Ame	ba46a9e0d1	Merge branch 'yt-dlp:master' into biliSearchPageIE	2024-07-01 21:02:51 +12:00
grqx_wsl	ca780a228c	Fix search key(prefix) in supportedsites.md	2024-07-01 16:32:10 +12:00
grqx_wsl	d6842fcd7f	change search key from bilisearchall to biliallsearch to disambiguate	2024-07-01 10:33:22 +12:00
grqx_wsl	ddca238423	modified: yt_dlp/extractor/bilibili.py	2024-06-30 20:53:08 +12:00
grqx_wsl	72fac58401	Supported more params	2024-06-29 20:05:53 +12:00
grqx_wsl	41bb0c6b37	Add SearchInfoExtractor: BiliBiliSearchAllIE	2024-06-28 12:10:00 +12:00
grqx_wsl	fc79d7325c	add test, add geo verification proxy support	2024-06-28 11:45:54 +12:00
grqx	31b0294bd6	Merge branch 'yt-dlp:master' into biliSearchPageIE	2024-06-28 10:42:01 +12:00
grqx	31be8d3dbd	Merge branch 'yt-dlp:master' into biliSearchPageIE	2024-06-28 00:52:00 +12:00
grqx_wsl	af0eb72a8b	uses json api, pass hatch test	2024-06-28 00:32:37 +12:00
grqx	f40c4b4ee2	stop using codespaces modified: yt_dlp/extractor/_extractors.py modified: yt_dlp/extractor/bilibili.py	2024-06-24 11:24:32 +00:00