Compare commits

...

6 Commits

Author SHA1 Message Date
Rohan Dey
cf11b40ac4
[ie/media.ccc.de:lists] Fix extraction (#8144)
Closes #8138
Authored by: Rohxn16
2023-09-18 23:39:20 +00:00
niemands
40999467f7
[ie/pornbox] Add extractor (#7386)
Authored by: niemands
2023-09-18 23:37:17 +00:00
u-spec-png
8ac5b6d96a
[ie/N1Info:article] Fix extractor (#7373)
Authored by: u-spec-png
2023-09-18 23:36:10 +00:00
c-basalt
69b03f84f8
[ie/weibo] Fix extractor and support user extraction (#7657)
Closes #3964, Closes #4673, Closes #6979
Authored by: c-basalt
2023-09-18 23:06:36 +00:00
c-basalt
9e68747f96
[ie/bilibili] Add support for series, favorites and watch later (#7518)
Closes #6719
Authored by: c-basalt
2023-09-18 23:02:00 +00:00
Elyse
ba8e9eb2c8
[ie/radiofrance] Add support for livestreams, podcasts, playlists (#7006)
Closes #4282
Authored by: elyse0
2023-09-18 21:08:40 +00:00
7 changed files with 1030 additions and 142 deletions

View File

@ -223,7 +223,11 @@ from .bilibili import (
BiliBiliPlayerIE, BiliBiliPlayerIE,
BilibiliSpaceVideoIE, BilibiliSpaceVideoIE,
BilibiliSpaceAudioIE, BilibiliSpaceAudioIE,
BilibiliSpacePlaylistIE, BilibiliCollectionListIE,
BilibiliSeriesListIE,
BilibiliFavoritesListIE,
BilibiliWatchlaterIE,
BilibiliPlaylistIE,
BiliIntlIE, BiliIntlIE,
BiliIntlSeriesIE, BiliIntlSeriesIE,
BiliLiveIE, BiliLiveIE,
@ -1501,6 +1505,7 @@ from .polskieradio import (
from .popcorntimes import PopcorntimesIE from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE from .popcorntv import PopcornTVIE
from .porn91 import Porn91IE from .porn91 import Porn91IE
from .pornbox import PornboxIE
from .porncom import PornComIE from .porncom import PornComIE
from .pornflip import PornFlipIE from .pornflip import PornFlipIE
from .pornhd import PornHdIE from .pornhd import PornHdIE
@ -1555,7 +1560,14 @@ from .radiocanada import (
from .radiode import RadioDeIE from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE from .radiobremen import RadioBremenIE
from .radiofrance import FranceCultureIE, RadioFranceIE from .radiofrance import (
FranceCultureIE,
RadioFranceIE,
RadioFranceLiveIE,
RadioFrancePodcastIE,
RadioFranceProfileIE,
RadioFranceProgramScheduleIE,
)
from .radiozet import RadioZetPodcastIE from .radiozet import RadioZetPodcastIE
from .radiokapital import ( from .radiokapital import (
RadioKapitalIE, RadioKapitalIE,
@ -2360,7 +2372,8 @@ from .webofstories import (
) )
from .weibo import ( from .weibo import (
WeiboIE, WeiboIE,
WeiboMobileIE WeiboVideoIE,
WeiboUserIE,
) )
from .weiqitv import WeiqiTVIE from .weiqitv import WeiqiTVIE
from .weverse import ( from .weverse import (

View File

@ -15,6 +15,7 @@ from ..utils import (
GeoRestrictedError, GeoRestrictedError,
InAdvancePagedList, InAdvancePagedList,
OnDemandPagedList, OnDemandPagedList,
bool_or_none,
filter_dict, filter_dict,
float_or_none, float_or_none,
format_field, format_field,
@ -35,6 +36,7 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
variadic,
) )
@ -156,7 +158,7 @@ class BilibiliBaseIE(InfoExtractor):
class BiliBiliIE(BilibiliBaseIE): class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/video/BV13x41117TL', 'url': 'https://www.bilibili.com/video/BV13x41117TL',
@ -252,7 +254,7 @@ class BiliBiliIE(BilibiliBaseIE):
'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
'duration': 313.557, 'duration': 313.557,
'upload_date': '20220709', 'upload_date': '20220709',
'uploader': '小夫Tech', 'uploader': '小夫太渴',
'timestamp': 1657347907, 'timestamp': 1657347907,
'uploader_id': '1326814124', 'uploader_id': '1326814124',
'comment_count': int, 'comment_count': int,
@ -509,7 +511,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
class BiliBiliBangumiMediaIE(BilibiliBaseIE): class BiliBiliBangumiMediaIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'url': 'https://www.bilibili.com/bangumi/media/md24097891',
'info_dict': { 'info_dict': {
@ -528,7 +530,7 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
class BiliBiliBangumiSeasonIE(BilibiliBaseIE): class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
_VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)' _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'url': 'https://www.bilibili.com/bangumi/play/ss26801',
'info_dict': { 'info_dict': {
@ -679,13 +681,35 @@ class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
return self.playlist_result(paged_list, playlist_id) return self.playlist_result(paged_list, playlist_id)
class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE):
_VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)' def _get_entries(self, page_data, bvid_keys, ending_key='bvid'):
for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})):
yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid)
def _get_uploader(self, uid, playlist_id):
webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False)
return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False)
def _extract_playlist(self, fetch_page, get_metadata, get_entries):
metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries)
metadata.pop('page_count', None)
metadata.pop('page_size', None)
return metadata, page_list
class BilibiliCollectionListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
'info_dict': { 'info_dict': {
'id': '2142762_57445', 'id': '2142762_57445',
'title': '《底特律 变人》' 'title': '【完结】《底特律 变人》全结局流程解说',
'description': '',
'uploader': '老戴在此',
'uploader_id': '2142762',
'timestamp': int,
'upload_date': str,
'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg',
}, },
'playlist_mincount': 31, 'playlist_mincount': 31,
}] }]
@ -706,22 +730,251 @@ class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
return { return {
'page_count': math.ceil(entry_count / page_size), 'page_count': math.ceil(entry_count / page_size),
'page_size': page_size, 'page_size': page_size,
'title': traverse_obj(page_data, ('meta', 'name')) 'uploader': self._get_uploader(mid, playlist_id),
**traverse_obj(page_data, {
'title': ('meta', 'name', {str}),
'description': ('meta', 'description', {str}),
'uploader_id': ('meta', 'mid', {str_or_none}),
'timestamp': ('meta', 'ptime', {int_or_none}),
'thumbnail': ('meta', 'cover', {url_or_none}),
})
} }
def get_entries(page_data): def get_entries(page_data):
for entry in page_data.get('archives', []): return self._get_entries(page_data, 'archives')
yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
BiliBiliIE, entry['bvid'])
metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
return self.playlist_result(paged_list, playlist_id, metadata['title']) return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliSeriesListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0',
'info_dict': {
'id': '1958703906_547718',
'title': '直播回放',
'description': '直播回放',
'uploader': '靡烟miya',
'uploader_id': '1958703906',
'timestamp': 1637985853,
'upload_date': '20211127',
'modified_timestamp': int,
'modified_date': str,
},
'playlist_mincount': 513,
}]
def _real_extract(self, url):
mid, sid = self._match_valid_url(url).group('mid', 'sid')
playlist_id = f'{mid}_{sid}'
playlist_meta = traverse_obj(self._download_json(
f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False
), {
'title': ('data', 'meta', 'name', {str}),
'description': ('data', 'meta', 'description', {str}),
'uploader_id': ('data', 'meta', 'mid', {str_or_none}),
'timestamp': ('data', 'meta', 'ctime', {int_or_none}),
'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}),
})
def fetch_page(page_idx):
return self._download_json(
'https://api.bilibili.com/x/series/archives',
playlist_id, note=f'Downloading page {page_idx}',
query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data']
def get_metadata(page_data):
page_size = page_data['page']['size']
entry_count = page_data['page']['total']
return {
'page_count': math.ceil(entry_count / page_size),
'page_size': page_size,
'uploader': self._get_uploader(mid, playlist_id),
**playlist_meta
}
def get_entries(page_data):
return self._get_entries(page_data, 'archives')
metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create',
'info_dict': {
'id': '1103407912',
'title': '【V2】',
'description': '',
'uploader': '晓月春日',
'uploader_id': '84912',
'timestamp': 1604905176,
'upload_date': '20201109',
'modified_timestamp': int,
'modified_date': str,
'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
'view_count': int,
'like_count': int,
},
'playlist_mincount': 22,
}, {
'url': 'https://www.bilibili.com/medialist/detail/ml1103407912',
'only_matching': True,
}]
def _real_extract(self, url):
fid = self._match_id(url)
list_info = self._download_json(
f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20',
fid, note='Downloading favlist metadata')
if list_info['code'] == -403:
self.raise_login_required(msg='This is a private favorites list. You need to log in as its owner')
entries = self._get_entries(self._download_json(
f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}',
fid, note='Download favlist entries'), 'data')
return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', {
'title': ('title', {str}),
'description': ('intro', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'modified_timestamp': ('mtime', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
'view_count': ('cnt_info', 'play', {int_or_none}),
'like_count': ('cnt_info', 'thumb_up', {int_or_none}),
})))
class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.bilibili.com/watchlater/#/list',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}]
def _real_extract(self, url):
list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater')
watchlater_info = self._download_json(
'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id)
if watchlater_info['code'] == -101:
self.raise_login_required(msg='You need to login to access your watchlater list')
entries = self._get_entries(watchlater_info, ('data', 'list'))
return self.playlist_result(entries, id=list_id, title='稍后再看')
class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.bilibili.com/list/1958703906?sid=547718',
'info_dict': {
'id': '5_547718',
'title': '直播回放',
'uploader': '靡烟miya',
'uploader_id': '1958703906',
'timestamp': 1637985853,
'upload_date': '20211127',
},
'playlist_mincount': 513,
}, {
'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
'info_dict': {
'id': '5_547718',
},
'playlist_mincount': 513,
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/ml1103407912',
'info_dict': {
'id': '3_1103407912',
'title': '【V2】',
'uploader': '晓月春日',
'uploader_id': '84912',
'timestamp': 1604905176,
'upload_date': '20201109',
'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
},
'playlist_mincount': 22,
}, {
'url': 'https://www.bilibili.com/medialist/play/ml1103407912',
'info_dict': {
'id': '3_1103407912',
},
'playlist_mincount': 22,
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}, {
'url': 'https://www.bilibili.com/medialist/play/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}]
def _extract_medialist(self, query, list_id):
for page_num in itertools.count(1):
page_data = self._download_json(
'https://api.bilibili.com/x/v2/medialist/resource/list',
list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}'
)['data']
yield from self._get_entries(page_data, 'media_list', ending_key='bv_id')
query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id'))
if not page_data.get('has_more', False):
break
def _real_extract(self, url):
list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none}))
error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none}))
if error_code == -400 and list_id == 'watchlater':
self.raise_login_required('You need to login to access your watchlater playlist')
elif error_code == -403:
self.raise_login_required('This is a private playlist. You need to login as its owner')
elif error_code == 11010:
raise ExtractorError('Playlist is no longer available', expected=True)
raise ExtractorError(f'Could not access playlist: {error_code} {error_message}')
query = {
'ps': 20,
'with_current': False,
**traverse_obj(initial_state, {
'type': ('playlist', 'type', {int_or_none}),
'biz_id': ('playlist', 'id', {int_or_none}),
'tid': ('tid', {int_or_none}),
'sort_field': ('sortFiled', {int_or_none}),
'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}),
})
}
metadata = {
'id': f'{query["type"]}_{query["biz_id"]}',
**traverse_obj(initial_state, ('mediaListInfo', {
'title': ('title', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
})),
}
return self.playlist_result(self._extract_medialist(query, list_id), **metadata)
class BilibiliCategoryIE(InfoExtractor): class BilibiliCategoryIE(InfoExtractor):
IE_NAME = 'Bilibili category extractor' IE_NAME = 'Bilibili category extractor'
_MAX_RESULTS = 1000000 _MAX_RESULTS = 1000000
_VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' _VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/v/kichiku/mad', 'url': 'https://www.bilibili.com/v/kichiku/mad',
'info_dict': { 'info_dict': {
@ -1406,7 +1659,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
class BiliLiveIE(InfoExtractor): class BiliLiveIE(InfoExtractor):
_VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)' _VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://live.bilibili.com/196', 'url': 'https://live.bilibili.com/196',

View File

@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor):
'id': '30c3', 'id': '30c3',
}, },
'playlist_count': 135, 'playlist_count': 135,
}, {
'url': 'https://media.ccc.de/c/DS2023',
'info_dict': {
'title': 'Datenspuren 2023',
'id': 'DS2023',
},
'playlist_count': 37
}] }]
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url).lower() playlist_id = self._match_id(url)
conf = self._download_json( conf = self._download_json(
'https://media.ccc.de/public/conferences/' + playlist_id, 'https://media.ccc.de/public/conferences/' + playlist_id,

View File

@ -33,7 +33,7 @@ class N1InfoAssetIE(InfoExtractor):
class N1InfoIIE(InfoExtractor): class N1InfoIIE(InfoExtractor):
IE_NAME = 'N1Info:article' IE_NAME = 'N1Info:article'
_VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
# Youtube embedded # Youtube embedded
'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor):
'upload_date': '20211102', 'upload_date': '20211102',
'timestamp': 1635861677, 'timestamp': 1635861677,
}, },
}, {
'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/',
'info_dict': {
'id': '1332368',
'ext': 'mp4',
'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama',
'upload_date': '20230620',
'timestamp': 1687290536,
'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg'
},
}, { }, {
'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
'only_matching': True, 'only_matching': True,
@ -105,9 +115,24 @@ class N1InfoIIE(InfoExtractor):
title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title')
timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage))
plugin_data = self._html_search_meta('BridPlugin', webpage)
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
entries = [] entries = []
if plugin_data:
site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id')
for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage):
video_id = self._parse_json(video_data, title)['video']
entries.append({
'id': video_id,
'title': title,
'timestamp': timestamp,
'thumbnail': self._html_search_meta('thumbnailURL', webpage),
'formats': self._extract_m3u8_formats(
f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8',
video_id, fatal=False),
})
else:
# Old player still present in older articles
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
for video in videos: for video in videos:
video_data = extract_attributes(video) video_data = extract_attributes(video)
entries.append({ entries.append({
@ -117,7 +142,8 @@ class N1InfoIIE(InfoExtractor):
'title': title, 'title': title,
'thumbnail': video_data.get('data-thumbnail'), 'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp, 'timestamp': timestamp,
'ie_key': 'N1InfoAsset'}) 'ie_key': 'N1InfoAsset',
})
embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
for embedded_video in embedded_videos: for embedded_video in embedded_videos:

113
yt_dlp/extractor/pornbox.py Normal file
View File

@ -0,0 +1,113 @@
from .common import InfoExtractor
from ..compat import functools
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
qualities,
str_or_none,
traverse_obj,
url_or_none,
)
class PornboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://pornbox.com/application/watch-page/212108',
'md5': '3ff6b6e206f263be4c5e987a3162ac6e',
'info_dict': {
'id': '212108',
'ext': 'mp4',
'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49',
'uploader': 'Lily Strong',
'timestamp': 1665871200,
'upload_date': '20221015',
'age_limit': 18,
'availability': 'needs_auth',
'duration': 1505,
'cast': ['Lily Strong', 'John Strong'],
'tags': 'count:11',
'description': 'md5:589c7f33e183aa8aa939537300efb859',
'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$'
}
}, {
'url': 'https://pornbox.com/application/watch-page/216045',
'info_dict': {
'id': '216045',
'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2',
'description': 'md5:3e631dcaac029f15ed434e402d1b06c7',
'uploader': 'VK Studio',
'timestamp': 1618264800,
'upload_date': '20210412',
'age_limit': 18,
'availability': 'premium_only',
'duration': 2710,
'cast': 'count:3',
'tags': 'count:29',
'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$',
'subtitles': 'count:6'
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True
},
'expected_warnings': [
'You are either not logged in or do not have access to this scene',
'No video formats found', 'Requested format is not available']
}]
def _real_extract(self, url):
video_id = self._match_id(url)
public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id)
subtitles = {country_code: [{
'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}',
'ext': 'srt'
}] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))}
is_free_scene = traverse_obj(
public_data, ('price', 'is_available_for_free', {bool}), default=False)
metadata = {
'id': video_id,
**traverse_obj(public_data, {
'title': ('scene_name', {str.strip}),
'description': ('small_description', {str.strip}),
'uploader': 'studio',
'duration': ('runtime', {parse_duration}),
'cast': (('models', 'male_models'), ..., 'model_name'),
'thumbnail': ('player_poster', {url_or_none}),
'tags': ('niches', ..., 'niche'),
}),
'age_limit': 18,
'timestamp': parse_iso8601(traverse_obj(
public_data, ('studios', 'release_date'), 'publish_date')),
'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene),
'subtitles': subtitles,
}
if not public_data.get('is_purchased') or not is_free_scene:
self.raise_login_required(
'You are either not logged in or do not have access to this scene', metadata_available=True)
return metadata
media_id = traverse_obj(public_data, (
'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False)
if not media_id:
self.raise_no_formats('Could not find stream id', video_id=video_id)
stream_data = self._download_json(
f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls')
get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
'url': 'src',
'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
'format_id': ('quality', {str_or_none}),
'quality': ('quality', {get_quality}),
'width': ('size', {lambda x: int(x[:-1])}),
}))
return metadata

View File

@ -1,7 +1,18 @@
import itertools
import re import re
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_duration, unified_strdate from ..utils import (
int_or_none,
join_nonempty,
js_to_json,
parse_duration,
strftime_or_none,
traverse_obj,
unified_strdate,
urljoin,
)
class RadioFranceIE(InfoExtractor): class RadioFranceIE(InfoExtractor):
@ -56,8 +67,32 @@ class RadioFranceIE(InfoExtractor):
} }
class FranceCultureIE(InfoExtractor): class RadioFranceBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])' _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'
_STATIONS_RE = '|'.join(map(re.escape, (
'franceculture',
'franceinfo',
'franceinter',
'francemusique',
'fip',
'mouv',
)))
def _extract_data_from_webpage(self, webpage, display_id, key):
return traverse_obj(self._search_json(
r'\bconst\s+data\s*=', webpage, key, display_id,
contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json),
(..., 'data', key, {dict}), get_all=False) or {}
class FranceCultureIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
'''
_TESTS = [ _TESTS = [
{ {
'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor):
'ext': 'mp3', 'ext': 'mp3',
'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?', 'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?',
'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?', 'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?',
'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20220514', 'upload_date': '20220514',
'duration': 2750, 'duration': 2750,
}, },
}, },
{
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
'info_dict': {
'id': '2107675',
'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
'description': 'md5:36ee74351ede77a314fdebb94026b916',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20230310',
'duration': 8977,
'ext': 'mp3',
},
},
{ {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
'only_matching': True,
} }
] ]
@ -89,7 +140,6 @@ class FranceCultureIE(InfoExtractor):
'id': video_id, 'id': video_id,
'display_id': display_id, 'display_id': display_id,
'url': video_data['contentUrl'], 'url': video_data['contentUrl'],
'ext': video_data.get('encodingFormat'),
'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
'duration': parse_duration(video_data.get('duration')), 'duration': parse_duration(video_data.get('duration')),
'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
@ -102,3 +152,322 @@ class FranceCultureIE(InfoExtractor):
'upload_date': unified_strdate(self._search_regex( 'upload_date': unified_strdate(self._search_regex(
r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
} }
class RadioFranceLiveIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
https?://(?:www\.)?radiofrance\.fr
/(?P<id>{RadioFranceBaseIE._STATIONS_RE})
/?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinter/',
'info_dict': {
'id': 'franceinter',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/franceculture',
'info_dict': {
'id': 'franceculture',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
'info_dict': {
'id': 'mouv-radio-musique-kids-family',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
'info_dict': {
'id': 'mouv-radio-rnb-soul',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
'info_dict': {
'id': 'mouv-radio-musique-mix',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/fip/radio-rock',
'info_dict': {
'id': 'fip-radio-rock',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv',
'only_matching': True,
}]
def _real_extract(self, url):
station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')
if substation_id:
webpage = self._download_webpage(url, station_id)
api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
else:
api_response = self._download_json(
f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
formats, subtitles = [], {}
for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
if media_source.get('format') == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'url': media_source['url'],
'abr': media_source.get('bitrate'),
})
return {
'id': join_nonempty(station_id, substation_id),
'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
'formats': formats,
'subtitles': subtitles,
'is_live': True,
}
class RadioFrancePlaylistBase(RadioFranceBaseIE):
"""Subclasses must set _METADATA_KEY"""
def _call_api(self, content_id, cursor, page_num):
raise NotImplementedError('This method must be implemented by subclasses')
def _generate_playlist_entries(self, content_id, content_response):
for page_num in itertools.count(2):
for entry in content_response['items']:
yield self.url_result(
f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
'title': 'title',
'description': 'standFirst',
'timestamp': ('publishedDate', {int_or_none}),
'thumbnail': ('visual', 'src'),
}))
next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
if not next_cursor:
break
content_response = self._call_api(content_id, next_cursor, page_num)
def _real_extract(self, url):
display_id = self._match_id(url)
metadata = self._download_json(
'https://www.radiofrance.fr/api/v2.1/path', display_id,
query={'value': urllib.parse.urlparse(url).path})['content']
content_id = metadata['id']
return self.playlist_result(
self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
display_id=display_id, **{**traverse_obj(metadata, {
'title': 'title',
'description': 'standFirst',
'thumbnail': ('visual', 'src'),
}), **traverse_obj(metadata, {
'title': 'name',
'description': 'role',
})})
class RadioFrancePodcastIE(RadioFrancePlaylistBase):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
'info_dict': {
'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
'display_id': 'le-billet-vert',
'title': 'Le billet sciences',
'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 11,
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
'info_dict': {
'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
'display_id': 'jean-marie-le-pen-l-obsession-nationale',
'title': 'Jean-Marie Le Pen, l\'obsession nationale',
'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_count': 7,
}, {
'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
'info_dict': {
'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
'display_id': 'serie-thomas-grjebine',
'title': 'Thomas Grjebine',
},
'playlist_count': 1,
}, {
'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
'info_dict': {
'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
'display_id': 'certains-l-aiment-fip',
'title': 'Certains laiment Fip',
'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 321,
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
'only_matching': True,
}]
_METADATA_KEY = 'expressions'
def _call_api(self, podcast_id, cursor, page_num):
return self._download_json(
f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
note=f'Downloading page {page_num}', query={'pageCursor': cursor})
class RadioFranceProfileIE(RadioFrancePlaylistBase):
_VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
'info_dict': {
'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
'display_id': 'thomas-pesquet',
'title': 'Thomas Pesquet',
'description': 'Astronaute à l\'agence spatiale européenne',
},
'playlist_mincount': 212,
}, {
'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
'info_dict': {
'id': '9593050b-0183-4972-a0b5-d8f699079e02',
'display_id': 'eugenie-bastie',
'title': 'Eugénie Bastié',
'description': 'Journaliste et essayiste',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 39,
}, {
'url': 'https://www.radiofrance.fr/personnes/lea-salame',
'only_matching': True,
}]
_METADATA_KEY = 'documents'
def _call_api(self, profile_id, cursor, page_num):
resp = self._download_json(
f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
note=f'Downloading page {page_num}', query={
'relation': 'personality',
'cursor': cursor,
})
resp['next'] = traverse_obj(resp, ('pagination', 'next'))
return resp
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?P<station>{RadioFranceBaseIE._STATIONS_RE})
/grille-programmes(?:\?date=(?P<date>[\d-]+))?
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
'info_dict': {
'id': 'franceinter-program-20230217',
'upload_date': '20230217',
},
'playlist_count': 25,
}, {
'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
'info_dict': {
'id': 'franceculture-program-20230201',
'upload_date': '20230201',
},
'playlist_count': 25,
}, {
'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
'info_dict': {
'id': 'mouv-program-20230319',
'upload_date': '20230319',
},
'playlist_count': 3,
}, {
'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
'info_dict': {
'id': 'francemusique-program-20230318',
'upload_date': '20230318',
},
'playlist_count': 15,
}, {
'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
'only_matching': True,
}]
def _generate_playlist_entries(self, webpage_url, api_response):
for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
yield self.url_result(
urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
url_transparent=True, **traverse_obj(entry, {
'title': ('expression', 'title'),
'thumbnail': ('expression', 'visual', 'src'),
'timestamp': ('startTime', {int_or_none}),
'series_id': ('concept', 'id'),
'series': ('concept', 'title'),
}))
def _real_extract(self, url):
station, date = self._match_valid_url(url).group('station', 'date')
webpage = self._download_webpage(url, station)
grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')
return self.playlist_result(
self._generate_playlist_entries(url, grid_data),
join_nonempty(station, 'program', upload_date), upload_date=upload_date)

View File

@ -1,134 +1,241 @@
from .common import InfoExtractor
import json
import random import random
import re import itertools
import urllib.parse
from ..compat import ( from .common import InfoExtractor
compat_parse_qs,
compat_str,
)
from ..utils import ( from ..utils import (
js_to_json, int_or_none,
make_archive_id,
mimetype2ext,
parse_resolution,
str_or_none,
strip_jsonp, strip_jsonp,
traverse_obj,
url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin,
) )
class WeiboIE(InfoExtractor): class WeiboBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)' def _update_visitor_cookies(self, video_id):
_TEST = {
'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
'info_dict': {
'id': 'Fp6RGfbff',
'ext': 'mp4',
'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
# to get Referer url for genvisitor
webpage, urlh = self._download_webpage_handle(url, video_id)
visitor_url = urlh.url
if 'passport.weibo.com' in visitor_url:
# first visit
visitor_data = self._download_json( visitor_data = self._download_json(
'https://passport.weibo.com/visitor/genvisitor', video_id, 'https://passport.weibo.com/visitor/genvisitor', video_id,
note='Generating first-visit data', note='Generating first-visit guest request',
transform_source=strip_jsonp, transform_source=strip_jsonp,
headers={'Referer': visitor_url},
data=urlencode_postdata({ data=urlencode_postdata({
'cb': 'gen_callback', 'cb': 'gen_callback',
'fp': json.dumps({ 'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
'os': '2',
'browser': 'Gecko57,0,0,0',
'fonts': 'undefined',
'screenInfo': '1440*900*24',
'plugins': '',
}),
})) }))
tid = visitor_data['data']['tid']
cnfd = '%03d' % visitor_data['data']['confidence']
self._download_webpage( self._download_webpage(
'https://passport.weibo.com/visitor/visitor', video_id, 'https://passport.weibo.com/visitor/visitor', video_id,
note='Running first-visit callback', note='Running first-visit callback to get guest cookies',
query={ query={
'a': 'incarnate', 'a': 'incarnate',
't': tid, 't': visitor_data['data']['tid'],
'w': 2, 'w': 2,
'c': cnfd, 'c': '%03d' % visitor_data['data']['confidence'],
'cb': 'cross_domain', 'cb': 'cross_domain',
'from': 'weibo', 'from': 'weibo',
'_rand': random.random(), '_rand': random.random(),
}) })
webpage = self._download_webpage( def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
url, video_id, note='Revisiting webpage') webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
self._update_visitor_cookies(video_id)
webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
return self._parse_json(webpage, video_id, fatal=fatal)
title = self._html_extract_title(webpage) def _extract_formats(self, video_info):
media_info = traverse_obj(video_info, ('page_info', 'media_info'))
video_formats = compat_parse_qs(self._search_regex( formats = traverse_obj(media_info, (
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) 'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
'url': 'url',
formats = [] 'format': ('quality_desc', {str}),
supported_resolutions = (480, 720) 'format_id': ('label', {str}),
for res in supported_resolutions: 'ext': ('mime', {mimetype2ext}),
vid_urls = video_formats.get(compat_str(res)) 'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
if not vid_urls or not isinstance(vid_urls, list): 'vcodec': ('video_codecs', {str}),
continue 'fps': ('fps', {int_or_none}),
'width': ('width', {int_or_none}),
vid_url = vid_urls[0] 'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
'acodec': ('audio_codecs', {str}),
'asr': ('audio_sample_rate', {int_or_none}),
'audio_channels': ('audio_channels', {int_or_none}),
}))
if not formats: # fallback, should be barely used
for url in set(traverse_obj(media_info, (..., {url_or_none}))):
if 'label=' in url: # filter out non-video urls
format_id, resolution = self._search_regex(
r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
group=(1, 2), default=(None, None))
formats.append({ formats.append({
'url': vid_url, 'url': url,
'height': res, 'format_id': format_id,
**parse_resolution(resolution),
**traverse_obj(media_info, (
'video_details', lambda _, v: v['label'].startswith(format_id), {
'size': ('size', {int_or_none}),
'tbr': ('bitrate', {int_or_none}),
}
), get_all=False),
}) })
return formats
uploader = self._og_search_property( def _parse_video_info(self, video_info, video_id=None):
'nick-name', webpage, 'uploader', default=None)
return { return {
'id': video_id, 'id': video_id,
'title': title, 'extractor_key': WeiboIE.ie_key(),
'uploader': uploader, 'extractor': WeiboIE.IE_NAME,
'formats': formats 'formats': self._extract_formats(video_info),
'http_headers': {'Referer': 'https://weibo.com/'},
'_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
**traverse_obj(video_info, {
'id': (('id', 'id_str', 'mid'), {str_or_none}),
'display_id': ('mblogid', {str_or_none}),
'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
'description': ('text_raw', {str}),
'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
'thumbnail': ('page_info', 'page_pic', {url_or_none}),
'uploader': ('user', 'screen_name', {str}),
'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
'like_count': ('attitudes_count', {int_or_none}),
'repost_count': ('reposts_count', {int_or_none}),
}, get_all=False),
'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
} }
class WeiboMobileIE(InfoExtractor): class WeiboIE(WeiboBaseIE):
_VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?' _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
_TEST = { _TESTS = [{
'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', 'url': 'https://weibo.com/7827771738/N4xlMvjhI',
'info_dict': {
'id': '4910815147462302',
'ext': 'mp4',
'display_id': 'N4xlMvjhI',
'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
'duration': 918,
'timestamp': 1686312819,
'upload_date': '20230609',
'thumbnail': r're:https://.*\.jpg',
'uploader': '睡前视频基地',
'uploader_id': '7827771738',
'uploader_url': 'https://weibo.com/u/7827771738',
'view_count': int,
'like_count': int,
'repost_count': int,
'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
},
}, {
'url': 'https://m.weibo.cn/status/4189191225395228',
'info_dict': { 'info_dict': {
'id': '4189191225395228', 'id': '4189191225395228',
'ext': 'mp4', 'ext': 'mp4',
'title': '午睡当然是要甜甜蜜蜜的啦', 'display_id': 'FBqgOmDxO',
'uploader': '柴犬柴犬' 'title': '柴犬柴犬的秒拍视频',
} 'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
'duration': 53,
'timestamp': 1514264429,
'upload_date': '20171226',
'thumbnail': r're:https://.*\.jpg',
'uploader': '柴犬柴犬',
'uploader_id': '5926682210',
'uploader_url': 'https://weibo.com/u/5926682210',
'view_count': int,
'like_count': int,
'repost_count': int,
} }
}, {
'url': 'https://weibo.com/0/4224132150961381',
'note': 'no playback_list example',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
# to get Referer url for genvisitor
webpage = self._download_webpage(url, video_id, note='visit the page')
weibo_info = self._parse_json(self._search_regex( return self._parse_video_info(self._weibo_download_json(
r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
webpage, 'js_code', flags=re.DOTALL),
video_id, transform_source=js_to_json)
status_data = weibo_info.get('status', {})
page_info = status_data.get('page_info')
title = status_data['status_title']
uploader = status_data.get('user', {}).get('screen_name')
return { class WeiboVideoIE(WeiboBaseIE):
'id': video_id, _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
'title': title, _TESTS = [{
'uploader': uploader, 'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
'url': page_info['media_info']['stream_url'] 'info_dict': {
'id': '4797700463137878',
'ext': 'mp4',
'display_id': 'LEZDodaiW',
'title': '稍微了解了一下靡烟miya感觉这东西也太二了',
'description': '稍微了解了一下靡烟miya感觉这东西也太二了 http://t.cn/A6aerGsM ',
'duration': 76,
'timestamp': 1659344278,
'upload_date': '20220801',
'thumbnail': r're:https://.*\.jpg',
'uploader': '君子爱财陈平安',
'uploader_id': '3905382233',
'uploader_url': 'https://weibo.com/u/3905382233',
'view_count': int,
'like_count': int,
'repost_count': int,
} }
}]
def _real_extract(self, url):
video_id = self._match_id(url)
post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
video_info = self._weibo_download_json(
f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}',
video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo']
return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)
class WeiboUserIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
_TESTS = [{
'url': 'https://weibo.com/u/2066652961?tabtype=video',
'info_dict': {
'id': '2066652961',
'title': '萧影殿下的视频',
'description': '萧影殿下的全部视频',
'uploader': '萧影殿下',
},
'playlist_mincount': 195,
}]
def _fetch_page(self, uid, cursor=0, page=1):
return self._weibo_download_json(
'https://weibo.com/ajax/profile/getWaterFallContent',
uid, note=f'Downloading videos page {page}',
query={'uid': uid, 'cursor': cursor})['data']
def _entries(self, uid, first_page):
cursor = 0
for page in itertools.count(1):
response = first_page if page == 1 else self._fetch_page(uid, cursor, page)
for video_info in traverse_obj(response, ('list', ..., {dict})):
yield self._parse_video_info(video_info)
cursor = response.get('next_cursor')
if (int_or_none(cursor) or -1) < 0:
break
def _real_extract(self, url):
uid = self._match_id(url)
first_page = self._fetch_page(uid)
uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)
metainfo = {
'title': f'{uploader}的视频',
'description': f'{uploader}的全部视频',
'uploader': uploader,
} if uploader else {}
return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)