Compare commits

...

6 Commits

Author SHA1 Message Date
Rohan Dey
cf11b40ac4
[ie/media.ccc.de:lists] Fix extraction (#8144)
Closes #8138
Authored by: Rohxn16
2023-09-18 23:39:20 +00:00
niemands
40999467f7
[ie/pornbox] Add extractor (#7386)
Authored by: niemands
2023-09-18 23:37:17 +00:00
u-spec-png
8ac5b6d96a
[ie/N1Info:article] Fix extractor (#7373)
Authored by: u-spec-png
2023-09-18 23:36:10 +00:00
c-basalt
69b03f84f8
[ie/weibo] Fix extractor and support user extraction (#7657)
Closes #3964, Closes #4673, Closes #6979
Authored by: c-basalt
2023-09-18 23:06:36 +00:00
c-basalt
9e68747f96
[ie/bilibili] Add support for series, favorites and watch later (#7518)
Closes #6719
Authored by: c-basalt
2023-09-18 23:02:00 +00:00
Elyse
ba8e9eb2c8
[ie/radiofrance] Add support for livestreams, podcasts, playlists (#7006)
Closes #4282
Authored by: elyse0
2023-09-18 21:08:40 +00:00
7 changed files with 1030 additions and 142 deletions

View File

@ -223,7 +223,11 @@ from .bilibili import (
BiliBiliPlayerIE,
BilibiliSpaceVideoIE,
BilibiliSpaceAudioIE,
BilibiliSpacePlaylistIE,
BilibiliCollectionListIE,
BilibiliSeriesListIE,
BilibiliFavoritesListIE,
BilibiliWatchlaterIE,
BilibiliPlaylistIE,
BiliIntlIE,
BiliIntlSeriesIE,
BiliLiveIE,
@ -1501,6 +1505,7 @@ from .polskieradio import (
from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE
from .porn91 import Porn91IE
from .pornbox import PornboxIE
from .porncom import PornComIE
from .pornflip import PornFlipIE
from .pornhd import PornHdIE
@ -1555,7 +1560,14 @@ from .radiocanada import (
from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import FranceCultureIE, RadioFranceIE
from .radiofrance import (
FranceCultureIE,
RadioFranceIE,
RadioFranceLiveIE,
RadioFrancePodcastIE,
RadioFranceProfileIE,
RadioFranceProgramScheduleIE,
)
from .radiozet import RadioZetPodcastIE
from .radiokapital import (
RadioKapitalIE,
@ -2360,7 +2372,8 @@ from .webofstories import (
)
from .weibo import (
WeiboIE,
WeiboMobileIE
WeiboVideoIE,
WeiboUserIE,
)
from .weiqitv import WeiqiTVIE
from .weverse import (

View File

@ -15,6 +15,7 @@ from ..utils import (
GeoRestrictedError,
InAdvancePagedList,
OnDemandPagedList,
bool_or_none,
filter_dict,
float_or_none,
format_field,
@ -35,6 +36,7 @@ from ..utils import (
unsmuggle_url,
url_or_none,
urlencode_postdata,
variadic,
)
@ -156,7 +158,7 @@ class BilibiliBaseIE(InfoExtractor):
class BiliBiliIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.bilibili.com/video/BV13x41117TL',
@ -252,7 +254,7 @@ class BiliBiliIE(BilibiliBaseIE):
'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4',
'duration': 313.557,
'upload_date': '20220709',
'uploader': '小夫Tech',
'uploader': '小夫太渴',
'timestamp': 1657347907,
'uploader_id': '1326814124',
'comment_count': int,
@ -509,7 +511,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
class BiliBiliBangumiMediaIE(BilibiliBaseIE):
_VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/media/md24097891',
'info_dict': {
@ -528,7 +530,7 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
_VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)'
_VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ss26801',
'info_dict': {
@ -679,13 +681,35 @@ class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE):
return self.playlist_result(paged_list, playlist_id)
class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
_VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)'
class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE):
def _get_entries(self, page_data, bvid_keys, ending_key='bvid'):
for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})):
yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid)
def _get_uploader(self, uid, playlist_id):
webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False)
return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False)
def _extract_playlist(self, fetch_page, get_metadata, get_entries):
metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries)
metadata.pop('page_count', None)
metadata.pop('page_size', None)
return metadata, page_list
class BilibiliCollectionListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445',
'info_dict': {
'id': '2142762_57445',
'title': '《底特律 变人》'
'title': '【完结】《底特律 变人》全结局流程解说',
'description': '',
'uploader': '老戴在此',
'uploader_id': '2142762',
'timestamp': int,
'upload_date': str,
'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg',
},
'playlist_mincount': 31,
}]
@ -706,22 +730,251 @@ class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE):
return {
'page_count': math.ceil(entry_count / page_size),
'page_size': page_size,
'title': traverse_obj(page_data, ('meta', 'name'))
'uploader': self._get_uploader(mid, playlist_id),
**traverse_obj(page_data, {
'title': ('meta', 'name', {str}),
'description': ('meta', 'description', {str}),
'uploader_id': ('meta', 'mid', {str_or_none}),
'timestamp': ('meta', 'ptime', {int_or_none}),
'thumbnail': ('meta', 'cover', {url_or_none}),
})
}
def get_entries(page_data):
for entry in page_data.get('archives', []):
yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}',
BiliBiliIE, entry['bvid'])
return self._get_entries(page_data, 'archives')
metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
return self.playlist_result(paged_list, playlist_id, metadata['title'])
return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliSeriesListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0',
'info_dict': {
'id': '1958703906_547718',
'title': '直播回放',
'description': '直播回放',
'uploader': '靡烟miya',
'uploader_id': '1958703906',
'timestamp': 1637985853,
'upload_date': '20211127',
'modified_timestamp': int,
'modified_date': str,
},
'playlist_mincount': 513,
}]
def _real_extract(self, url):
mid, sid = self._match_valid_url(url).group('mid', 'sid')
playlist_id = f'{mid}_{sid}'
playlist_meta = traverse_obj(self._download_json(
f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False
), {
'title': ('data', 'meta', 'name', {str}),
'description': ('data', 'meta', 'description', {str}),
'uploader_id': ('data', 'meta', 'mid', {str_or_none}),
'timestamp': ('data', 'meta', 'ctime', {int_or_none}),
'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}),
})
def fetch_page(page_idx):
return self._download_json(
'https://api.bilibili.com/x/series/archives',
playlist_id, note=f'Downloading page {page_idx}',
query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data']
def get_metadata(page_data):
page_size = page_data['page']['size']
entry_count = page_data['page']['total']
return {
'page_count': math.ceil(entry_count / page_size),
'page_size': page_size,
'uploader': self._get_uploader(mid, playlist_id),
**playlist_meta
}
def get_entries(page_data):
return self._get_entries(page_data, 'archives')
metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries)
return self.playlist_result(paged_list, playlist_id, **metadata)
class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)'
_TESTS = [{
'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create',
'info_dict': {
'id': '1103407912',
'title': '【V2】',
'description': '',
'uploader': '晓月春日',
'uploader_id': '84912',
'timestamp': 1604905176,
'upload_date': '20201109',
'modified_timestamp': int,
'modified_date': str,
'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
'view_count': int,
'like_count': int,
},
'playlist_mincount': 22,
}, {
'url': 'https://www.bilibili.com/medialist/detail/ml1103407912',
'only_matching': True,
}]
def _real_extract(self, url):
fid = self._match_id(url)
list_info = self._download_json(
f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20',
fid, note='Downloading favlist metadata')
if list_info['code'] == -403:
self.raise_login_required(msg='This is a private favorites list. You need to log in as its owner')
entries = self._get_entries(self._download_json(
f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}',
fid, note='Download favlist entries'), 'data')
return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', {
'title': ('title', {str}),
'description': ('intro', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'modified_timestamp': ('mtime', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
'view_count': ('cnt_info', 'play', {int_or_none}),
'like_count': ('cnt_info', 'thumb_up', {int_or_none}),
})))
class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)'
_TESTS = [{
'url': 'https://www.bilibili.com/watchlater/#/list',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}]
def _real_extract(self, url):
list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater')
watchlater_info = self._download_json(
'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id)
if watchlater_info['code'] == -101:
self.raise_login_required(msg='You need to login to access your watchlater list')
entries = self._get_entries(watchlater_info, ('data', 'list'))
return self.playlist_result(entries, id=list_id, title='稍后再看')
class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)'
_TESTS = [{
'url': 'https://www.bilibili.com/list/1958703906?sid=547718',
'info_dict': {
'id': '5_547718',
'title': '直播回放',
'uploader': '靡烟miya',
'uploader_id': '1958703906',
'timestamp': 1637985853,
'upload_date': '20211127',
},
'playlist_mincount': 513,
}, {
'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1',
'info_dict': {
'id': '5_547718',
},
'playlist_mincount': 513,
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/ml1103407912',
'info_dict': {
'id': '3_1103407912',
'title': '【V2】',
'uploader': '晓月春日',
'uploader_id': '84912',
'timestamp': 1604905176,
'upload_date': '20201109',
'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg",
},
'playlist_mincount': 22,
}, {
'url': 'https://www.bilibili.com/medialist/play/ml1103407912',
'info_dict': {
'id': '3_1103407912',
},
'playlist_mincount': 22,
'skip': 'redirect url',
}, {
'url': 'https://www.bilibili.com/list/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}, {
'url': 'https://www.bilibili.com/medialist/play/watchlater',
'info_dict': {'id': 'watchlater'},
'playlist_mincount': 0,
'skip': 'login required',
}]
def _extract_medialist(self, query, list_id):
for page_num in itertools.count(1):
page_data = self._download_json(
'https://api.bilibili.com/x/v2/medialist/resource/list',
list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}'
)['data']
yield from self._get_entries(page_data, 'media_list', ending_key='bv_id')
query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id'))
if not page_data.get('has_more', False):
break
def _real_extract(self, url):
list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id)
if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200:
error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none}))
error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none}))
if error_code == -400 and list_id == 'watchlater':
self.raise_login_required('You need to login to access your watchlater playlist')
elif error_code == -403:
self.raise_login_required('This is a private playlist. You need to login as its owner')
elif error_code == 11010:
raise ExtractorError('Playlist is no longer available', expected=True)
raise ExtractorError(f'Could not access playlist: {error_code} {error_message}')
query = {
'ps': 20,
'with_current': False,
**traverse_obj(initial_state, {
'type': ('playlist', 'type', {int_or_none}),
'biz_id': ('playlist', 'id', {int_or_none}),
'tid': ('tid', {int_or_none}),
'sort_field': ('sortFiled', {int_or_none}),
'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}),
})
}
metadata = {
'id': f'{query["type"]}_{query["biz_id"]}',
**traverse_obj(initial_state, ('mediaListInfo', {
'title': ('title', {str}),
'uploader': ('upper', 'name', {str}),
'uploader_id': ('upper', 'mid', {str_or_none}),
'timestamp': ('ctime', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
})),
}
return self.playlist_result(self._extract_medialist(query, list_id), **metadata)
class BilibiliCategoryIE(InfoExtractor):
IE_NAME = 'Bilibili category extractor'
_MAX_RESULTS = 1000000
_VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+'
_TESTS = [{
'url': 'https://www.bilibili.com/v/kichiku/mad',
'info_dict': {
@ -1406,7 +1659,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
class BiliLiveIE(InfoExtractor):
_VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)'
_VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://live.bilibili.com/196',

View File

@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor):
'id': '30c3',
},
'playlist_count': 135,
}, {
'url': 'https://media.ccc.de/c/DS2023',
'info_dict': {
'title': 'Datenspuren 2023',
'id': 'DS2023',
},
'playlist_count': 37
}]
def _real_extract(self, url):
playlist_id = self._match_id(url).lower()
playlist_id = self._match_id(url)
conf = self._download_json(
'https://media.ccc.de/public/conferences/' + playlist_id,

View File

@ -33,7 +33,7 @@ class N1InfoAssetIE(InfoExtractor):
class N1InfoIIE(InfoExtractor):
IE_NAME = 'N1Info:article'
_VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)'
_VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)'
_TESTS = [{
# Youtube embedded
'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/',
@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor):
'upload_date': '20211102',
'timestamp': 1635861677,
},
}, {
'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/',
'info_dict': {
'id': '1332368',
'ext': 'mp4',
'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama',
'upload_date': '20230620',
'timestamp': 1687290536,
'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg'
},
}, {
'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/',
'only_matching': True,
@ -105,19 +115,35 @@ class N1InfoIIE(InfoExtractor):
title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title')
timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage))
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
plugin_data = self._html_search_meta('BridPlugin', webpage)
entries = []
for video in videos:
video_data = extract_attributes(video)
entries.append({
'_type': 'url_transparent',
'url': video_data.get('data-url'),
'id': video_data.get('id'),
'title': title,
'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp,
'ie_key': 'N1InfoAsset'})
if plugin_data:
site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id')
for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage):
video_id = self._parse_json(video_data, title)['video']
entries.append({
'id': video_id,
'title': title,
'timestamp': timestamp,
'thumbnail': self._html_search_meta('thumbnailURL', webpage),
'formats': self._extract_m3u8_formats(
f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8',
video_id, fatal=False),
})
else:
# Old player still present in older articles
videos = re.findall(r'(?m)(<video[^>]+>)', webpage)
for video in videos:
video_data = extract_attributes(video)
entries.append({
'_type': 'url_transparent',
'url': video_data.get('data-url'),
'id': video_data.get('id'),
'title': title,
'thumbnail': video_data.get('data-thumbnail'),
'timestamp': timestamp,
'ie_key': 'N1InfoAsset',
})
embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage)
for embedded_video in embedded_videos:

113
yt_dlp/extractor/pornbox.py Normal file
View File

@ -0,0 +1,113 @@
from .common import InfoExtractor
from ..compat import functools
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
qualities,
str_or_none,
traverse_obj,
url_or_none,
)
class PornboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://pornbox.com/application/watch-page/212108',
'md5': '3ff6b6e206f263be4c5e987a3162ac6e',
'info_dict': {
'id': '212108',
'ext': 'mp4',
'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49',
'uploader': 'Lily Strong',
'timestamp': 1665871200,
'upload_date': '20221015',
'age_limit': 18,
'availability': 'needs_auth',
'duration': 1505,
'cast': ['Lily Strong', 'John Strong'],
'tags': 'count:11',
'description': 'md5:589c7f33e183aa8aa939537300efb859',
'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$'
}
}, {
'url': 'https://pornbox.com/application/watch-page/216045',
'info_dict': {
'id': '216045',
'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2',
'description': 'md5:3e631dcaac029f15ed434e402d1b06c7',
'uploader': 'VK Studio',
'timestamp': 1618264800,
'upload_date': '20210412',
'age_limit': 18,
'availability': 'premium_only',
'duration': 2710,
'cast': 'count:3',
'tags': 'count:29',
'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$',
'subtitles': 'count:6'
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True
},
'expected_warnings': [
'You are either not logged in or do not have access to this scene',
'No video formats found', 'Requested format is not available']
}]
def _real_extract(self, url):
video_id = self._match_id(url)
public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id)
subtitles = {country_code: [{
'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}',
'ext': 'srt'
}] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))}
is_free_scene = traverse_obj(
public_data, ('price', 'is_available_for_free', {bool}), default=False)
metadata = {
'id': video_id,
**traverse_obj(public_data, {
'title': ('scene_name', {str.strip}),
'description': ('small_description', {str.strip}),
'uploader': 'studio',
'duration': ('runtime', {parse_duration}),
'cast': (('models', 'male_models'), ..., 'model_name'),
'thumbnail': ('player_poster', {url_or_none}),
'tags': ('niches', ..., 'niche'),
}),
'age_limit': 18,
'timestamp': parse_iso8601(traverse_obj(
public_data, ('studios', 'release_date'), 'publish_date')),
'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene),
'subtitles': subtitles,
}
if not public_data.get('is_purchased') or not is_free_scene:
self.raise_login_required(
'You are either not logged in or do not have access to this scene', metadata_available=True)
return metadata
media_id = traverse_obj(public_data, (
'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False)
if not media_id:
self.raise_no_formats('Could not find stream id', video_id=video_id)
stream_data = self._download_json(
f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls')
get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], {
'url': 'src',
'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
'format_id': ('quality', {str_or_none}),
'quality': ('quality', {get_quality}),
'width': ('size', {lambda x: int(x[:-1])}),
}))
return metadata

View File

@ -1,7 +1,18 @@
import itertools
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import parse_duration, unified_strdate
from ..utils import (
int_or_none,
join_nonempty,
js_to_json,
parse_duration,
strftime_or_none,
traverse_obj,
unified_strdate,
urljoin,
)
class RadioFranceIE(InfoExtractor):
@ -56,8 +67,32 @@ class RadioFranceIE(InfoExtractor):
}
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
class RadioFranceBaseIE(InfoExtractor):
_VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'
_STATIONS_RE = '|'.join(map(re.escape, (
'franceculture',
'franceinfo',
'franceinter',
'francemusique',
'fip',
'mouv',
)))
def _extract_data_from_webpage(self, webpage, display_id, key):
return traverse_obj(self._search_json(
r'\bconst\s+data\s*=', webpage, key, display_id,
contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json),
(..., 'data', key, {dict}), get_all=False) or {}
class FranceCultureIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
'''
_TESTS = [
{
'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor):
'ext': 'mp3',
'title': 'La physique dEinstein aiderait-elle à comprendre le cerveau ?',
'description': 'Existerait-il un pont conceptuel entre la physique de lespace-temps et les neurosciences ?',
'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20220514',
'duration': 2750,
},
},
{
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
'info_dict': {
'id': '2107675',
'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
'description': 'md5:36ee74351ede77a314fdebb94026b916',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'upload_date': '20230310',
'duration': 8977,
'ext': 'mp3',
},
},
{
'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
'only_matching': True,
}
]
@ -89,7 +140,6 @@ class FranceCultureIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'url': video_data['contentUrl'],
'ext': video_data.get('encodingFormat'),
'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
'duration': parse_duration(video_data.get('duration')),
'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
@ -102,3 +152,322 @@ class FranceCultureIE(InfoExtractor):
'upload_date': unified_strdate(self._search_regex(
r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
}
class RadioFranceLiveIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
https?://(?:www\.)?radiofrance\.fr
/(?P<id>{RadioFranceBaseIE._STATIONS_RE})
/?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinter/',
'info_dict': {
'id': 'franceinter',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/franceculture',
'info_dict': {
'id': 'franceculture',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
'info_dict': {
'id': 'mouv-radio-musique-kids-family',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
'info_dict': {
'id': 'mouv-radio-rnb-soul',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
'info_dict': {
'id': 'mouv-radio-musique-mix',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/fip/radio-rock',
'info_dict': {
'id': 'fip-radio-rock',
'title': str,
'live_status': 'is_live',
'ext': 'aac',
},
'params': {
'skip_download': 'Livestream',
},
}, {
'url': 'https://www.radiofrance.fr/mouv',
'only_matching': True,
}]
def _real_extract(self, url):
station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')
if substation_id:
webpage = self._download_webpage(url, station_id)
api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
else:
api_response = self._download_json(
f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
formats, subtitles = [], {}
for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
if media_source.get('format') == 'hls':
fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append({
'url': media_source['url'],
'abr': media_source.get('bitrate'),
})
return {
'id': join_nonempty(station_id, substation_id),
'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
'formats': formats,
'subtitles': subtitles,
'is_live': True,
}
class RadioFrancePlaylistBase(RadioFranceBaseIE):
"""Subclasses must set _METADATA_KEY"""
def _call_api(self, content_id, cursor, page_num):
raise NotImplementedError('This method must be implemented by subclasses')
def _generate_playlist_entries(self, content_id, content_response):
for page_num in itertools.count(2):
for entry in content_response['items']:
yield self.url_result(
f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
'title': 'title',
'description': 'standFirst',
'timestamp': ('publishedDate', {int_or_none}),
'thumbnail': ('visual', 'src'),
}))
next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
if not next_cursor:
break
content_response = self._call_api(content_id, next_cursor, page_num)
def _real_extract(self, url):
display_id = self._match_id(url)
metadata = self._download_json(
'https://www.radiofrance.fr/api/v2.1/path', display_id,
query={'value': urllib.parse.urlparse(url).path})['content']
content_id = metadata['id']
return self.playlist_result(
self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
display_id=display_id, **{**traverse_obj(metadata, {
'title': 'title',
'description': 'standFirst',
'thumbnail': ('visual', 'src'),
}), **traverse_obj(metadata, {
'title': 'name',
'description': 'role',
})})
class RadioFrancePodcastIE(RadioFrancePlaylistBase):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?:{RadioFranceBaseIE._STATIONS_RE})
/podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
'info_dict': {
'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
'display_id': 'le-billet-vert',
'title': 'Le billet sciences',
'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 11,
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
'info_dict': {
'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
'display_id': 'jean-marie-le-pen-l-obsession-nationale',
'title': 'Jean-Marie Le Pen, l\'obsession nationale',
'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_count': 7,
}, {
'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
'info_dict': {
'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
'display_id': 'serie-thomas-grjebine',
'title': 'Thomas Grjebine',
},
'playlist_count': 1,
}, {
'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
'info_dict': {
'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
'display_id': 'certains-l-aiment-fip',
'title': 'Certains laiment Fip',
'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 321,
}, {
'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
'only_matching': True,
}, {
'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
'only_matching': True,
}]
_METADATA_KEY = 'expressions'
def _call_api(self, podcast_id, cursor, page_num):
return self._download_json(
f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
note=f'Downloading page {page_num}', query={'pageCursor': cursor})
class RadioFranceProfileIE(RadioFrancePlaylistBase):
_VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
'info_dict': {
'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
'display_id': 'thomas-pesquet',
'title': 'Thomas Pesquet',
'description': 'Astronaute à l\'agence spatiale européenne',
},
'playlist_mincount': 212,
}, {
'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
'info_dict': {
'id': '9593050b-0183-4972-a0b5-d8f699079e02',
'display_id': 'eugenie-bastie',
'title': 'Eugénie Bastié',
'description': 'Journaliste et essayiste',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
},
'playlist_mincount': 39,
}, {
'url': 'https://www.radiofrance.fr/personnes/lea-salame',
'only_matching': True,
}]
_METADATA_KEY = 'documents'
def _call_api(self, profile_id, cursor, page_num):
resp = self._download_json(
f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
note=f'Downloading page {page_num}', query={
'relation': 'personality',
'cursor': cursor,
})
resp['next'] = traverse_obj(resp, ('pagination', 'next'))
return resp
class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
_VALID_URL = rf'''(?x)
{RadioFranceBaseIE._VALID_URL_BASE}
/(?P<station>{RadioFranceBaseIE._STATIONS_RE})
/grille-programmes(?:\?date=(?P<date>[\d-]+))?
'''
_TESTS = [{
'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
'info_dict': {
'id': 'franceinter-program-20230217',
'upload_date': '20230217',
},
'playlist_count': 25,
}, {
'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
'info_dict': {
'id': 'franceculture-program-20230201',
'upload_date': '20230201',
},
'playlist_count': 25,
}, {
'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
'info_dict': {
'id': 'mouv-program-20230319',
'upload_date': '20230319',
},
'playlist_count': 3,
}, {
'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
'info_dict': {
'id': 'francemusique-program-20230318',
'upload_date': '20230318',
},
'playlist_count': 15,
}, {
'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
'only_matching': True,
}]
def _generate_playlist_entries(self, webpage_url, api_response):
for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
yield self.url_result(
urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
url_transparent=True, **traverse_obj(entry, {
'title': ('expression', 'title'),
'thumbnail': ('expression', 'visual', 'src'),
'timestamp': ('startTime', {int_or_none}),
'series_id': ('concept', 'id'),
'series': ('concept', 'title'),
}))
def _real_extract(self, url):
station, date = self._match_valid_url(url).group('station', 'date')
webpage = self._download_webpage(url, station)
grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')
return self.playlist_result(
self._generate_playlist_entries(url, grid_data),
join_nonempty(station, 'program', upload_date), upload_date=upload_date)

View File

@ -1,134 +1,241 @@
from .common import InfoExtractor
import json
import random
import re
import itertools
import urllib.parse
from ..compat import (
compat_parse_qs,
compat_str,
)
from .common import InfoExtractor
from ..utils import (
js_to_json,
int_or_none,
make_archive_id,
mimetype2ext,
parse_resolution,
str_or_none,
strip_jsonp,
traverse_obj,
url_or_none,
urlencode_postdata,
urljoin,
)
class WeiboIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
'info_dict': {
'id': 'Fp6RGfbff',
'ext': 'mp4',
'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
}
}
class WeiboBaseIE(InfoExtractor):
def _update_visitor_cookies(self, video_id):
visitor_data = self._download_json(
'https://passport.weibo.com/visitor/genvisitor', video_id,
note='Generating first-visit guest request',
transform_source=strip_jsonp,
data=urlencode_postdata({
'cb': 'gen_callback',
'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
}))
def _real_extract(self, url):
video_id = self._match_id(url)
# to get Referer url for genvisitor
webpage, urlh = self._download_webpage_handle(url, video_id)
visitor_url = urlh.url
if 'passport.weibo.com' in visitor_url:
# first visit
visitor_data = self._download_json(
'https://passport.weibo.com/visitor/genvisitor', video_id,
note='Generating first-visit data',
transform_source=strip_jsonp,
headers={'Referer': visitor_url},
data=urlencode_postdata({
'cb': 'gen_callback',
'fp': json.dumps({
'os': '2',
'browser': 'Gecko57,0,0,0',
'fonts': 'undefined',
'screenInfo': '1440*900*24',
'plugins': '',
}),
}))
tid = visitor_data['data']['tid']
cnfd = '%03d' % visitor_data['data']['confidence']
self._download_webpage(
'https://passport.weibo.com/visitor/visitor', video_id,
note='Running first-visit callback',
query={
'a': 'incarnate',
't': tid,
'w': 2,
'c': cnfd,
'cb': 'cross_domain',
'from': 'weibo',
'_rand': random.random(),
})
webpage = self._download_webpage(
url, video_id, note='Revisiting webpage')
title = self._html_extract_title(webpage)
video_formats = compat_parse_qs(self._search_regex(
r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
formats = []
supported_resolutions = (480, 720)
for res in supported_resolutions:
vid_urls = video_formats.get(compat_str(res))
if not vid_urls or not isinstance(vid_urls, list):
continue
vid_url = vid_urls[0]
formats.append({
'url': vid_url,
'height': res,
self._download_webpage(
'https://passport.weibo.com/visitor/visitor', video_id,
note='Running first-visit callback to get guest cookies',
query={
'a': 'incarnate',
't': visitor_data['data']['tid'],
'w': 2,
'c': '%03d' % visitor_data['data']['confidence'],
'cb': 'cross_domain',
'from': 'weibo',
'_rand': random.random(),
})
uploader = self._og_search_property(
'nick-name', webpage, 'uploader', default=None)
def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
self._update_visitor_cookies(video_id)
webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
return self._parse_json(webpage, video_id, fatal=fatal)
def _extract_formats(self, video_info):
media_info = traverse_obj(video_info, ('page_info', 'media_info'))
formats = traverse_obj(media_info, (
'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
'url': 'url',
'format': ('quality_desc', {str}),
'format_id': ('label', {str}),
'ext': ('mime', {mimetype2ext}),
'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
'vcodec': ('video_codecs', {str}),
'fps': ('fps', {int_or_none}),
'width': ('width', {int_or_none}),
'height': ('height', {int_or_none}),
'filesize': ('size', {int_or_none}),
'acodec': ('audio_codecs', {str}),
'asr': ('audio_sample_rate', {int_or_none}),
'audio_channels': ('audio_channels', {int_or_none}),
}))
if not formats: # fallback, should be barely used
for url in set(traverse_obj(media_info, (..., {url_or_none}))):
if 'label=' in url: # filter out non-video urls
format_id, resolution = self._search_regex(
r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
group=(1, 2), default=(None, None))
formats.append({
'url': url,
'format_id': format_id,
**parse_resolution(resolution),
**traverse_obj(media_info, (
'video_details', lambda _, v: v['label'].startswith(format_id), {
'size': ('size', {int_or_none}),
'tbr': ('bitrate', {int_or_none}),
}
), get_all=False),
})
return formats
def _parse_video_info(self, video_info, video_id=None):
return {
'id': video_id,
'title': title,
'uploader': uploader,
'formats': formats
'extractor_key': WeiboIE.ie_key(),
'extractor': WeiboIE.IE_NAME,
'formats': self._extract_formats(video_info),
'http_headers': {'Referer': 'https://weibo.com/'},
'_old_archive_ids': [make_archive_id('WeiboMobile', video_id)],
**traverse_obj(video_info, {
'id': (('id', 'id_str', 'mid'), {str_or_none}),
'display_id': ('mblogid', {str_or_none}),
'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
'description': ('text_raw', {str}),
'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
'thumbnail': ('page_info', 'page_pic', {url_or_none}),
'uploader': ('user', 'screen_name', {str}),
'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
'like_count': ('attitudes_count', {int_or_none}),
'repost_count': ('reposts_count', {int_or_none}),
}, get_all=False),
'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
}
class WeiboMobileIE(InfoExtractor):
_VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?'
_TEST = {
'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0',
class WeiboIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://weibo.com/7827771738/N4xlMvjhI',
'info_dict': {
'id': '4910815147462302',
'ext': 'mp4',
'display_id': 'N4xlMvjhI',
'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
'duration': 918,
'timestamp': 1686312819,
'upload_date': '20230609',
'thumbnail': r're:https://.*\.jpg',
'uploader': '睡前视频基地',
'uploader_id': '7827771738',
'uploader_url': 'https://weibo.com/u/7827771738',
'view_count': int,
'like_count': int,
'repost_count': int,
'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
},
}, {
'url': 'https://m.weibo.cn/status/4189191225395228',
'info_dict': {
'id': '4189191225395228',
'ext': 'mp4',
'title': '午睡当然是要甜甜蜜蜜的啦',
'uploader': '柴犬柴犬'
'display_id': 'FBqgOmDxO',
'title': '柴犬柴犬的秒拍视频',
'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
'duration': 53,
'timestamp': 1514264429,
'upload_date': '20171226',
'thumbnail': r're:https://.*\.jpg',
'uploader': '柴犬柴犬',
'uploader_id': '5926682210',
'uploader_url': 'https://weibo.com/u/5926682210',
'view_count': int,
'like_count': int,
'repost_count': int,
}
}
}, {
'url': 'https://weibo.com/0/4224132150961381',
'note': 'no playback_list example',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
# to get Referer url for genvisitor
webpage = self._download_webpage(url, video_id, note='visit the page')
weibo_info = self._parse_json(self._search_regex(
r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};',
webpage, 'js_code', flags=re.DOTALL),
video_id, transform_source=js_to_json)
return self._parse_video_info(self._weibo_download_json(
f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id))
status_data = weibo_info.get('status', {})
page_info = status_data.get('page_info')
title = status_data['status_title']
uploader = status_data.get('user', {}).get('screen_name')
return {
'id': video_id,
'title': title,
'uploader': uploader,
'url': page_info['media_info']['stream_url']
class WeiboVideoIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
_TESTS = [{
'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
'info_dict': {
'id': '4797700463137878',
'ext': 'mp4',
'display_id': 'LEZDodaiW',
'title': '稍微了解了一下靡烟miya感觉这东西也太二了',
'description': '稍微了解了一下靡烟miya感觉这东西也太二了 http://t.cn/A6aerGsM ',
'duration': 76,
'timestamp': 1659344278,
'upload_date': '20220801',
'thumbnail': r're:https://.*\.jpg',
'uploader': '君子爱财陈平安',
'uploader_id': '3905382233',
'uploader_url': 'https://weibo.com/u/3905382233',
'view_count': int,
'like_count': int,
'repost_count': int,
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
video_info = self._weibo_download_json(
f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}',
video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo']
return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE)
class WeiboUserIE(WeiboBaseIE):
_VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
_TESTS = [{
'url': 'https://weibo.com/u/2066652961?tabtype=video',
'info_dict': {
'id': '2066652961',
'title': '萧影殿下的视频',
'description': '萧影殿下的全部视频',
'uploader': '萧影殿下',
},
'playlist_mincount': 195,
}]
def _fetch_page(self, uid, cursor=0, page=1):
return self._weibo_download_json(
'https://weibo.com/ajax/profile/getWaterFallContent',
uid, note=f'Downloading videos page {page}',
query={'uid': uid, 'cursor': cursor})['data']
def _entries(self, uid, first_page):
cursor = 0
for page in itertools.count(1):
response = first_page if page == 1 else self._fetch_page(uid, cursor, page)
for video_info in traverse_obj(response, ('list', ..., {dict})):
yield self._parse_video_info(video_info)
cursor = response.get('next_cursor')
if (int_or_none(cursor) or -1) < 0:
break
def _real_extract(self, url):
uid = self._match_id(url)
first_page = self._fetch_page(uid)
uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)
metainfo = {
'title': f'{uploader}的视频',
'description': f'{uploader}的全部视频',
'uploader': uploader,
} if uploader else {}
return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)