Compare commits

...

12 Commits

Author SHA1 Message Date
hafeoz 0cf2beba03
Merge 8db4101fbe into 6b54cccdcb 2024-05-09 23:21:45 +02:00
Alexandre Huot 6b54cccdcb
[ie/Qub] Fix extractor (#7019)
Closes #4989
Authored by: alexhuot1, dirkf
2024-05-08 22:10:06 +00:00
src-tinkerer c4b87dd885
[ie/ZenYandex] Fix extractor (#9813)
Closes #9803
Authored by: src-tinkerer
2024-05-08 21:27:30 +00:00
fireattack 2338827072
[ie/bilibili] Fix `--geo-verification-proxy` support (#9817)
Closes #9797
Authored by: fireattack
2024-05-08 21:24:44 +00:00
fireattack 06d52c8731
[ie/BilibiliSpaceVideo] Better error message (#9839)
Closes #9528
Authored by: fireattack
2024-05-08 21:09:38 +00:00
sepro df5c9e733a
[ie/vk] Improve format extraction (#9885)
Closes #5675
Authored by: seproDev
2024-05-08 23:02:22 +02:00
Mozi b38018b781
[ie/mixch] Extract comments (#9860)
Authored by: pzhlkj6612
2024-05-08 20:51:16 +00:00
Rasmus Antons 145dc6f656
[ie/boosty] Add cookies support (#9522)
Closes #9401
Authored by: RasmusAntons
2024-05-08 20:16:32 +00:00
bashonly 5904853ae5
[ie/crunchyroll] Support browser impersonation (#9857)
Closes #7442
Authored by: bashonly
2024-05-05 23:15:32 +00:00
Chris Caruso c8bf48f3a8
[ie/cbc.ca:player] Improve `_VALID_URL` (#9866)
Closes #9825
Authored by: carusocr
2024-05-05 23:02:24 +00:00
The-MAGI 351368cb9a
[ie/youporn] Fix extractor (#8827)
Closes #7967
Authored by: The-MAGI
2024-05-05 22:57:38 +00:00
hafeoz 8db4101fbe
[ie/neteasemusic] Add additional quality formats
Taken from song info https://music.163.com/#/song?id=1952853555

Cross-checked with 43078108be/pycloudmusic/object/music163.py (L352)
2024-02-12 19:15:45 +00:00
10 changed files with 151 additions and 63 deletions

View File

@ -93,11 +93,11 @@ def extract_formats(self, play_info):
return formats
def _download_playinfo(self, video_id, cid):
def _download_playinfo(self, video_id, cid, headers=None):
return self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note=f'Downloading video formats for cid {cid}')['data']
note=f'Downloading video formats for cid {cid}', headers=headers)['data']
def json2srt(self, json_data):
srt_data = ''
@ -493,7 +493,8 @@ class BiliBiliIE(BilibiliBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage, urlh = self._download_webpage_handle(url, video_id)
headers = self.geo_verification_headers()
webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers)
if not self._match_valid_url(urlh.url):
return self.url_result(urlh.url)
@ -531,7 +532,7 @@ def _real_extract(self, url):
self._download_json(
'https://api.bilibili.com/x/player/pagelist', video_id,
fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'},
note='Extracting videos in anthology'),
note='Extracting videos in anthology', headers=headers),
'data', expected_type=list) or []
is_anthology = len(page_list_json) > 1
@ -552,7 +553,7 @@ def _real_extract(self, url):
festival_info = {}
if is_festival:
play_info = self._download_playinfo(video_id, cid)
play_info = self._download_playinfo(video_id, cid, headers=headers)
festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'),
@ -666,14 +667,15 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
def _real_extract(self, url):
episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id)
headers = self.geo_verification_headers()
webpage = self._download_webpage(url, episode_id, headers=headers)
if '您所在的地区无法观看本片' in webpage:
raise GeoRestrictedError('This video is restricted')
elif '正在观看预览,大会员免费看全片' in webpage:
self.raise_login_required('This video is for premium members only')
headers = {'Referer': url, **self.geo_verification_headers()}
headers['Referer'] = url
play_info = self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
@ -724,7 +726,7 @@ def _real_extract(self, url):
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid),
'__post_extractor': self.extract_comments(aid),
'http_headers': headers,
'http_headers': {'Referer': url},
}
@ -1049,9 +1051,10 @@ def fetch_page(page_idx):
raise ExtractorError(
'Request is blocked by server (412), please add cookies, wait and try later.', expected=True)
raise
if response['code'] == -401:
if response['code'] in (-352, -401):
raise ExtractorError(
'Request is blocked by server (401), please add cookies, wait and try later.', expected=True)
f'Request is blocked by server ({-response["code"]}), '
'please add cookies, wait and try later.', expected=True)
return response['data']
def get_metadata(page_data):

View File

@ -1,7 +1,11 @@
import json
import urllib.parse
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
bug_reports_message,
int_or_none,
qualities,
str_or_none,
@ -162,9 +166,19 @@ def _extract_formats(self, player_urls, video_id):
def _real_extract(self, url):
user, post_id = self._match_valid_url(url).group('user', 'post_id')
auth_headers = {}
auth_cookie = self._get_cookies('https://boosty.to/').get('auth')
if auth_cookie is not None:
try:
auth_data = json.loads(urllib.parse.unquote(auth_cookie.value))
auth_headers['Authorization'] = f'Bearer {auth_data["accessToken"]}'
except (json.JSONDecodeError, KeyError):
self.report_warning(f'Failed to extract token from auth cookie{bug_reports_message()}')
post = self._download_json(
f'https://api.boosty.to/v1/blog/{user}/post/{post_id}', post_id,
note='Downloading post data', errnote='Unable to download post data')
note='Downloading post data', errnote='Unable to download post data', headers=auth_headers)
post_title = post.get('title')
if not post_title:
@ -202,7 +216,9 @@ def _real_extract(self, url):
'thumbnail': (('previewUrl', 'defaultPreview'), {url_or_none}),
}, get_all=False)})
if not entries:
if not entries and not post.get('hasAccess'):
self.raise_login_required('This post requires a subscription', metadata_available=True)
elif not entries:
raise ExtractorError('No videos found', expected=True)
if len(entries) == 1:
return entries[0]

View File

@ -151,7 +151,7 @@ def _real_extract(self, url):
class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c',
@ -277,6 +277,28 @@ class CBCPlayerIE(InfoExtractor):
'location': 'Canada',
'media_type': 'Full Program',
},
}, {
'url': 'https://www.cbc.ca/player/play/video/1.7194274',
'md5': '188b96cf6bdcb2540e178a6caa957128',
'info_dict': {
'id': '2334524995812',
'ext': 'mp4',
'title': '#TheMoment a rare white spirit moose was spotted in Alberta',
'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3',
'timestamp': 1714788791,
'duration': 77.678,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg',
'uploader': 'CBCC-NEW',
'chapters': 'count:0',
'upload_date': '20240504',
'categories': 'count:3',
'series': 'The National',
'tags': 'count:15',
'creators': ['encoder'],
'location': 'Canada',
'media_type': 'Excerpt',
},
}, {
'url': 'cbcplayer:1.7159484',
'only_matching': True,

View File

@ -53,15 +53,19 @@ def _set_auth_info(self, response):
CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10)
def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'):
try: # TODO: Add impersonation support here
try:
return self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote,
headers=headers, data=urlencode_postdata(data))
headers=headers, data=urlencode_postdata(data), impersonate=True)
except ExtractorError as error:
if not isinstance(error.cause, HTTPError) or error.cause.status != 403:
raise
if target := error.cause.response.extensions.get('impersonate'):
raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"')
raise ExtractorError(
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
'Request blocked by Cloudflare. '
'Install the required impersonation dependency if possible, '
'or else navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True)

View File

@ -1,6 +1,12 @@
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none
from ..utils import (
ExtractorError,
UserNotLive,
int_or_none,
str_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
@ -9,17 +15,20 @@ class MixchIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
_TESTS = [{
'url': 'https://mixch.tv/u/16236849/live',
'url': 'https://mixch.tv/u/16943797/live',
'skip': 'don\'t know if this live persists',
'info_dict': {
'id': '16236849',
'title': '24配信シェア⭕投票🙏💦',
'comment_count': 13145,
'view_count': 28348,
'timestamp': 1636189377,
'uploader': '🦥伊咲👶🏻#フレアワ',
'uploader_id': '16236849',
}
'id': '16943797',
'ext': 'mp4',
'title': '#EntView #カリナ #セブチ 2024-05-05 06:58',
'comment_count': int,
'view_count': int,
'timestamp': 1714726805,
'uploader': 'Ent.View K-news🎶💕',
'uploader_id': '16943797',
'live_status': 'is_live',
'upload_date': '20240503',
},
}, {
'url': 'https://mixch.tv/u/16137876/live',
'only_matching': True,
@ -48,8 +57,20 @@ def _real_extract(self, url):
'protocol': 'm3u8',
}],
'is_live': True,
'__post_extractor': self.extract_comments(video_id),
}
def _get_comments(self, video_id):
yield from traverse_obj(self._download_json(
f'https://mixch.tv/api-web/lives/{video_id}/messages', video_id,
note='Downloading comments', errnote='Failed to download comments'), (..., {
'author': ('name', {str}),
'author_id': ('user_id', {str_or_none}),
'id': ('message_id', {str}, {lambda x: x or None}),
'text': ('body', {str}),
'timestamp': ('created', {int}),
}))
class MixchArchiveIE(InfoExtractor):
IE_NAME = 'mixch:archive'

View File

@ -22,7 +22,7 @@
class NetEaseMusicBaseIE(InfoExtractor):
_FORMATS = ['bMusic', 'mMusic', 'hMusic']
_FORMATS = ['sqMusic', 'hrMusic', 'lMusic', 'bMusic', 'mMusic', 'hMusic']
_API_BASE = 'http://music.163.com/api/'
_GEO_BYPASS = False

View File

@ -1,10 +1,9 @@
import functools
import re
from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
smuggle_url,
strip_or_none,
)
from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none
from ..utils.traversal import traverse_obj
class TVAIE(InfoExtractor):
@ -49,11 +48,20 @@ class QubIE(InfoExtractor):
'info_dict': {
'id': '6084352463001',
'ext': 'mp4',
'title': 'Épisode 01',
'title': 'Ép 01. Mon dernier jour',
'uploader_id': '5481942443001',
'upload_date': '20190907',
'timestamp': 1567899756,
'description': 'md5:9c0d7fbb90939420c651fd977df90145',
'thumbnail': r're:https://.+\.jpg',
'episode': 'Ép 01. Mon dernier jour',
'episode_number': 1,
'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'],
'duration': 2625.963,
'season': 'Season 1',
'season_number': 1,
'series': 'Alerte Amber',
'channel': 'TVA',
},
}, {
'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943',
@ -64,22 +72,24 @@ class QubIE(InfoExtractor):
def _real_extract(self, url):
entity_id = self._match_id(url)
entity = self._download_json(
'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities',
entity_id, query={'id': entity_id})
webpage = self._download_webpage(url, entity_id)
entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData']
video_id = entity['videoId']
episode = strip_or_none(entity.get('name'))
return {
'_type': 'url_transparent',
'url': f'https://videos.tva.ca/details/_{video_id}',
'ie_key': TVAIE.ie_key(),
'id': video_id,
'title': episode,
# 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'],
'url': 'https://videos.tva.ca/details/_' + video_id,
'description': entity.get('longDescription'),
'duration': float_or_none(entity.get('durationMillis'), 1000),
'episode': episode,
'episode_number': int_or_none(entity.get('episodeNumber')),
# 'ie_key': 'BrightcoveNew',
'ie_key': TVAIE.ie_key(),
**traverse_obj(entity, {
'description': ('longDescription', {str}),
'duration': ('durationMillis', {functools.partial(float_or_none, scale=1000)}),
'channel': ('knownEntities', 'channel', 'name', {str}),
'series': ('knownEntities', 'videoShow', 'name', {str}),
'season_number': ('slug', {lambda x: re.search(r'/s(?:ai|ea)son-(\d+)/', x)}, 1, {int_or_none}),
'episode_number': ('episodeNumber', {int_or_none}),
}),
}

View File

@ -451,6 +451,7 @@ def _real_extract(self, url):
info_page, 'view count', default=None))
formats = []
subtitles = {}
for format_id, format_url in data.items():
format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
@ -462,12 +463,21 @@ def _real_extract(self, url):
formats.append({
'format_id': format_id,
'url': format_url,
'ext': 'mp4',
'source_preference': 1,
'height': height,
})
elif format_id == 'hls':
formats.extend(self._extract_m3u8_formats(
fmts, subs = self._extract_m3u8_formats_and_subtitles(
format_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False, live=is_live))
m3u8_id=format_id, fatal=False, live=is_live)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif format_id.startswith('dash_'):
fmts, subs = self._extract_mpd_formats_and_subtitles(
format_url, video_id, mpd_id=format_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif format_id == 'rtmp':
formats.append({
'format_id': format_id,
@ -475,7 +485,6 @@ def _real_extract(self, url):
'ext': 'flv',
})
subtitles = {}
for sub in data.get('subs') or {}:
subtitles.setdefault(sub.get('lang', 'en'), []).append({
'ext': sub.get('title', '.srt').split('.')[-1],
@ -496,6 +505,7 @@ def _real_extract(self, url):
'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live,
'subtitles': subtitles,
'_format_sort_fields': ('res', 'source'),
}

View File

@ -259,15 +259,15 @@ def _real_extract(self, url):
webpage = self._download_webpage(redirect, video_id, note='Redirecting')
data_json = self._search_json(
r'("data"\s*:|data\s*=)', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}')
serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)',
webpage, 'server state').replace('State', 'Settings')
serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state')
uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)',
webpage, 'uploader', default='<a>')
uploader_name = extract_attributes(uploader).get('aria-label')
video_json = try_get(data_json, lambda x: x[serverstate]['exportData']['video'], dict)
stream_urls = try_get(video_json, lambda x: x['video']['streams'])
item_id = traverse_obj(data_json, (serverstate, 'videoViewer', 'openedItemId', {str}))
video_json = traverse_obj(data_json, (serverstate, 'videoViewer', 'items', item_id, {dict})) or {}
formats, subtitles = [], {}
for s_url in stream_urls:
for s_url in traverse_obj(video_json, ('video', 'streams', ..., {url_or_none})):
ext = determine_ext(s_url)
if ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(s_url, video_id, mpd_id='dash')

View File

@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor):
'id': '16290308',
'age_limit': 18,
'categories': [],
'description': 'md5:00ea70f642f431c379763c17c2f396bc',
'description': str, # TODO: detect/remove SEO spam description in ytdl backport
'display_id': 'tinderspecial-trailer1',
'duration': 298.0,
'ext': 'mp4',
'upload_date': '20201123',
'uploader': 'Ersties',
'tags': [],
'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg',
'timestamp': 1606089600,
'thumbnail': r're:https://.+\.jpg',
'timestamp': 1606147564,
'title': 'Tinder In Real Life',
'view_count': int,
}
@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor):
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
definitions = self._download_json(
f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id)
self._set_cookie('.youporn.com', 'age_verified', '1')
webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id)
definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions']
def get_format_data(data, f):
return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl']))
def get_format_data(data, stream_type):
info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any))
if not info_url:
return []
return traverse_obj(
self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False),
lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl']))
formats = []
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
@ -123,10 +129,6 @@ def get_format_data(data, f):
f['height'] = height
formats.append(f)
webpage = self._download_webpage(
'http://www.youporn.com/watch/%s' % video_id, display_id,
headers={'Cookie': 'age_verified=1'})
title = self._html_search_regex(
r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
webpage, 'title', default=None) or self._og_search_title(