Compare commits

...

12 Commits

Author SHA1 Message Date
red-acid e64f64095d
Merge 8e977a039e into 5904853ae5 2024-05-06 17:33:54 +01:00
bashonly 5904853ae5
[ie/crunchyroll] Support browser impersonation (#9857)
Closes #7442
Authored by: bashonly
2024-05-05 23:15:32 +00:00
Chris Caruso c8bf48f3a8
[ie/cbc.ca:player] Improve `_VALID_URL` (#9866)
Closes #9825
Authored by: carusocr
2024-05-05 23:02:24 +00:00
The-MAGI 351368cb9a
[ie/youporn] Fix extractor (#8827)
Closes #7967
Authored by: The-MAGI
2024-05-05 22:57:38 +00:00
sepro 96da952504
[core] Warn if lack of ffmpeg alters format selection (#9805)
Authored by: seproDev, pukkandan
2024-05-05 00:44:08 +02:00
bashonly bec9a59e8e
[networking] Add `extensions` attribute to `Response` (#9756)
CurlCFFIRH now provides an `impersonate` field in its responses' extensions

Authored by: bashonly
2024-05-04 22:19:42 +00:00
bashonly 036e0d92c6
[ie/patreon] Extract multiple embeds (#9850)
Closes #9848
Authored by: bashonly
2024-05-04 22:11:11 +00:00
bashonly cb2fb4a643
[ie/crunchyroll] Always make metadata available (#9772)
Closes #9750
Authored by: bashonly
2024-05-04 16:15:44 +00:00
bashonly 231c2eacc4
[ie/soundcloud] Extract `genres` (#9821)
Authored by: bashonly
2024-05-04 16:14:36 +00:00
bashonly c4853655cb
[ie/wrestleuniverse] Avoid partial stream formats (#9800)
Authored by: bashonly
2024-05-04 16:07:15 +00:00
red-acid 8e977a039e
Update rtp.py based on some comments in youtube-dl#29824 2024-03-02 22:46:47 +00:00
red-acid afa82654fa
Update rtp.py 2024-03-02 18:18:26 +00:00
11 changed files with 391 additions and 147 deletions

View File

@ -785,6 +785,25 @@ def test_supported_impersonate_targets(self, handler):
assert res.status == 200
assert std_headers['user-agent'].lower() not in res.read().decode().lower()
def test_response_extensions(self, handler):
with handler() as rh:
for target in rh.supported_targets:
request = Request(
f'http://127.0.0.1:{self.http_port}/gen_200', extensions={'impersonate': target})
res = validate_and_send(rh, request)
assert res.extensions['impersonate'] == rh._get_request_target(request)
def test_http_error_response_extensions(self, handler):
with handler() as rh:
for target in rh.supported_targets:
request = Request(
f'http://127.0.0.1:{self.http_port}/gen_404', extensions={'impersonate': target})
try:
validate_and_send(rh, request)
except HTTPError as e:
res = e.response
assert res.extensions['impersonate'] == rh._get_request_target(request)
class TestRequestHandlerMisc:
"""Misc generic tests for request handlers, not related to request or validation testing"""

View File

@ -2136,6 +2136,11 @@ def _filter(f):
def _check_formats(self, formats):
for f in formats:
working = f.get('__working')
if working is not None:
if working:
yield f
continue
self.to_screen('[info] Testing format %s' % f['format_id'])
path = self.get_output_path('temp')
if not self._ensure_dir_exists(f'{path}/'):
@ -2152,33 +2157,44 @@ def _check_formats(self, formats):
os.remove(temp_file.name)
except OSError:
self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
f['__working'] = success
if success:
yield f
else:
self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
def _select_formats(self, formats, selector):
return list(selector({
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
def _default_format_spec(self, info_dict, download=True):
download = download and not self.params.get('simulate')
prefer_best = download and (
self.params['outtmpl']['default'] == '-'
or info_dict.get('is_live') and not self.params.get('live_from_start'))
def can_merge():
merger = FFmpegMergerPP(self)
return merger.available and merger.can_merge()
prefer_best = (
not self.params.get('simulate')
and download
and (
not can_merge()
or info_dict.get('is_live') and not self.params.get('live_from_start')
or self.params['outtmpl']['default'] == '-'))
compat = (
prefer_best
or self.params.get('allow_multiple_audio_streams', False)
or 'format-spec' in self.params['compat_opts'])
if not prefer_best and download and not can_merge():
prefer_best = True
formats = self._get_formats(info_dict)
evaluate_formats = lambda spec: self._select_formats(formats, self.build_format_selector(spec))
if evaluate_formats('b/bv+ba') != evaluate_formats('bv*+ba/b'):
self.report_warning('ffmpeg not found. The downloaded format may not be the best available. '
'Installing ffmpeg is strongly recommended: https://github.com/yt-dlp/yt-dlp#dependencies')
return (
'best/bestvideo+bestaudio' if prefer_best
else 'bestvideo*+bestaudio/best' if not compat
else 'bestvideo+bestaudio/best')
compat = (self.params.get('allow_multiple_audio_streams')
or 'format-spec' in self.params['compat_opts'])
return ('best/bestvideo+bestaudio' if prefer_best
else 'bestvideo+bestaudio/best' if compat
else 'bestvideo*+bestaudio/best')
def build_format_selector(self, format_spec):
def syntax_error(note, start):
@ -2928,12 +2944,7 @@ def is_wellformed(f):
self.write_debug(f'Default format spec: {req_format}')
format_selector = self.build_format_selector(req_format)
formats_to_download = list(format_selector({
'formats': formats,
'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats),
'incomplete_formats': (all(f.get('vcodec') == 'none' for f in formats) # No formats with video
or all(f.get('acodec') == 'none' for f in formats)), # OR, No formats with audio
}))
formats_to_download = self._select_formats(formats, format_selector)
if interactive_format_selection and not formats_to_download:
self.report_error('Requested format is not available', tb=False, is_error=False)
continue

View File

@ -151,7 +151,7 @@ def _real_extract(self, url):
class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c',
@ -277,6 +277,28 @@ class CBCPlayerIE(InfoExtractor):
'location': 'Canada',
'media_type': 'Full Program',
},
}, {
'url': 'https://www.cbc.ca/player/play/video/1.7194274',
'md5': '188b96cf6bdcb2540e178a6caa957128',
'info_dict': {
'id': '2334524995812',
'ext': 'mp4',
'title': '#TheMoment a rare white spirit moose was spotted in Alberta',
'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3',
'timestamp': 1714788791,
'duration': 77.678,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg',
'uploader': 'CBCC-NEW',
'chapters': 'count:0',
'upload_date': '20240504',
'categories': 'count:3',
'series': 'The National',
'tags': 'count:15',
'creators': ['encoder'],
'location': 'Canada',
'media_type': 'Excerpt',
},
}, {
'url': 'cbcplayer:1.7159484',
'only_matching': True,

View File

@ -53,15 +53,19 @@ def _set_auth_info(self, response):
CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10)
def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'):
try: # TODO: Add impersonation support here
try:
return self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote,
headers=headers, data=urlencode_postdata(data))
headers=headers, data=urlencode_postdata(data), impersonate=True)
except ExtractorError as error:
if not isinstance(error.cause, HTTPError) or error.cause.status != 403:
raise
if target := error.cause.response.extensions.get('impersonate'):
raise ExtractorError(f'Got HTTP Error 403 when using impersonate target "{target}"')
raise ExtractorError(
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
'Request blocked by Cloudflare. '
'Install the required impersonation dependency if possible, '
'or else navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True)
@ -394,10 +398,11 @@ def entries():
if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
message = f'This {object_type} is for premium members only'
if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True)
self.raise_login_required(message, method='password')
result['formats'], result['subtitles'] = self._extract_stream(internal_id)
self.raise_no_formats(message, expected=True, video_id=internal_id)
else:
self.raise_login_required(message, method='password', metadata_available=True)
else:
result['formats'], result['subtitles'] = self._extract_stream(internal_id)
result['chapters'] = self._extract_chapters(internal_id)
@ -583,14 +588,16 @@ def _real_extract(self, url):
if not response:
raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
result = self._transform_music_response(response)
if not self._IS_PREMIUM and response.get('isPremiumOnly'):
message = f'This {response.get("type") or "media"} is for premium members only'
if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True)
self.raise_login_required(message, method='password')
result = self._transform_music_response(response)
result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
self.raise_no_formats(message, expected=True, video_id=internal_id)
else:
self.raise_login_required(message, method='password', metadata_available=True)
else:
result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
return result

View File

@ -219,7 +219,29 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': r're:^https?://.+',
},
'params': {'skip_download': 'm3u8'},
}, {
# multiple attachments/embeds
'url': 'https://www.patreon.com/posts/holy-wars-solos-100601977',
'playlist_count': 3,
'info_dict': {
'id': '100601977',
'title': '"Holy Wars" (Megadeth) Solos Transcription & Lesson/Analysis',
'description': 'md5:d099ab976edfce6de2a65c2b169a88d3',
'uploader': 'Bradley Hall',
'uploader_id': '24401883',
'uploader_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_id': '3193932',
'channel_url': 'https://www.patreon.com/bradleyhallguitar',
'channel_follower_count': int,
'timestamp': 1710777855,
'upload_date': '20240318',
'like_count': int,
'comment_count': int,
'thumbnail': r're:^https?://.+',
},
'skip': 'Patron-only content',
}]
_RETURN_TYPE = 'video'
def _real_extract(self, url):
video_id = self._match_id(url)
@ -234,58 +256,54 @@ def _real_extract(self, url):
'include': 'audio,user,user_defined_tags,campaign,attachments_media',
})
attributes = post['data']['attributes']
title = attributes['title'].strip()
image = attributes.get('image') or {}
info = {
'id': video_id,
'title': title,
'description': clean_html(attributes.get('content')),
'thumbnail': image.get('large_url') or image.get('url'),
'timestamp': parse_iso8601(attributes.get('published_at')),
'like_count': int_or_none(attributes.get('like_count')),
'comment_count': int_or_none(attributes.get('comment_count')),
}
can_view_post = traverse_obj(attributes, 'current_user_can_view')
if can_view_post and info['comment_count']:
info['__post_extractor'] = self.extract_comments(video_id)
info = traverse_obj(attributes, {
'title': ('title', {str.strip}),
'description': ('content', {clean_html}),
'thumbnail': ('image', ('large_url', 'url'), {url_or_none}, any),
'timestamp': ('published_at', {parse_iso8601}),
'like_count': ('like_count', {int_or_none}),
'comment_count': ('comment_count', {int_or_none}),
})
for i in post.get('included', []):
i_type = i.get('type')
if i_type == 'media':
media_attributes = i.get('attributes') or {}
download_url = media_attributes.get('download_url')
entries = []
idx = 0
for include in traverse_obj(post, ('included', lambda _, v: v['type'])):
include_type = include['type']
if include_type == 'media':
media_attributes = traverse_obj(include, ('attributes', {dict})) or {}
download_url = url_or_none(media_attributes.get('download_url'))
ext = mimetype2ext(media_attributes.get('mimetype'))
# if size_bytes is None, this media file is likely unavailable
# See: https://github.com/yt-dlp/yt-dlp/issues/4608
size_bytes = int_or_none(media_attributes.get('size_bytes'))
if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None:
# XXX: what happens if there are multiple attachments?
return {
**info,
idx += 1
entries.append({
'id': f'{video_id}-{idx}',
'ext': ext,
'filesize': size_bytes,
'url': download_url,
}
elif i_type == 'user':
user_attributes = i.get('attributes')
if user_attributes:
info.update({
'uploader': user_attributes.get('full_name'),
'uploader_id': str_or_none(i.get('id')),
'uploader_url': user_attributes.get('url'),
})
elif i_type == 'post_tag':
info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value')))
elif include_type == 'user':
info.update(traverse_obj(include, {
'uploader': ('attributes', 'full_name', {str}),
'uploader_id': ('id', {str_or_none}),
'uploader_url': ('attributes', 'url', {url_or_none}),
}))
elif i_type == 'campaign':
info.update({
'channel': traverse_obj(i, ('attributes', 'title')),
'channel_id': str_or_none(i.get('id')),
'channel_url': traverse_obj(i, ('attributes', 'url')),
'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))),
})
elif include_type == 'post_tag':
if post_tag := traverse_obj(include, ('attributes', 'value', {str})):
info.setdefault('tags', []).append(post_tag)
elif include_type == 'campaign':
info.update(traverse_obj(include, {
'channel': ('attributes', 'title', {str}),
'channel_id': ('id', {str_or_none}),
'channel_url': ('attributes', 'url', {url_or_none}),
'channel_follower_count': ('attributes', 'patron_count', {int_or_none}),
}))
# handle Vimeo embeds
if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
@ -296,36 +314,50 @@ def _real_extract(self, url):
v_url, video_id, 'Checking Vimeo embed URL',
headers={'Referer': 'https://patreon.com/'},
fatal=False, errnote=False):
return self.url_result(
entries.append(self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
VimeoIE, url_transparent=True, **info)
VimeoIE, url_transparent=True))
embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
return self.url_result(embed_url, **info)
entries.append(self.url_result(embed_url))
post_file = traverse_obj(attributes, 'post_file')
post_file = traverse_obj(attributes, ('post_file', {dict}))
if post_file:
name = post_file.get('name')
ext = determine_ext(name)
if ext in KNOWN_EXTENSIONS:
return {
**info,
entries.append({
'id': video_id,
'ext': ext,
'url': post_file['url'],
}
})
elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id)
return {
**info,
entries.append({
'id': video_id,
'formats': formats,
'subtitles': subtitles,
}
})
if can_view_post is False:
can_view_post = traverse_obj(attributes, 'current_user_can_view')
comments = None
if can_view_post and info.get('comment_count'):
comments = self.extract_comments(video_id)
if not entries and can_view_post is False:
self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True)
else:
elif not entries:
self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True)
elif len(entries) == 1:
info.update(entries[0])
else:
for entry in entries:
entry.update(info)
return self.playlist_result(entries, video_id, **info, __post_extractor=comments)
info['id'] = video_id
info['__post_extractor'] = comments
return info
def _get_comments(self, post_id):

View File

@ -1,5 +1,11 @@
from .common import InfoExtractor
from ..utils import js_to_json
from ..utils import (
ExtractorError,
RegexNotFoundError,
determine_ext,
join_nonempty,
js_to_json,
)
import re
import json
import urllib.parse
@ -7,19 +13,72 @@
class RTPIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
_VALID_URL = r'https?://(?:(?:(?:www\.)?rtp\.pt/play/(?P<subarea>.*/)?p(?P<program_id>[0-9]+)/)|(?:arquivos\.rtp\.pt/conteudos/))(?P<id>[^/?#]+)/?'
_TESTS = [{
'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
'md5': 'e736ce0c665e459ddb818546220b4ef8',
'url': 'https://www.rtp.pt/play/p9165/e562949/por-do-sol',
'info_dict': {
'id': 'e174042',
'ext': 'mp3',
'title': 'Paixões Cruzadas',
'description': 'As paixões musicais de António Cartaxo e António Macedo',
'thumbnail': r're:^https?://.*\.jpg',
'id': 'e562949',
'ext': 'mp4',
'title': 'Pôr do Sol Episódio 1',
'description': 'Madalena Bourbon de Linhaça vive atormentada pelo segredo que esconde desde 1990. Matilde Bourbon de Linhaça sonha fugir com o seu amor proibido. O',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
'url': 'https://www.rtp.pt/play/p12646/e738493/telejornal',
'info_dict': {
'id': 'e738493',
'ext': 'mp4',
'title': 'Telejornal de 01 jan 2024 PARTE 1',
'description': 'A mais rigorosa seleção de notícias, todos os dias às 20h00. De segunda a domingo, João Adelino Faria, José Rodrigues dos Santos e Ana Lourenço',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'https://www.rtp.pt/play/p6646/e457262/grande-entrevista',
'info_dict': {
'id': 'e457262',
'ext': 'mp4',
'title': 'Grande Entrevista Episódio 7 - de 19 fev 2020',
'description': 'Bruno Nogueira - É um dos mais originais humoristas portugueses e de maior êxito! Bruno Nogueira na Grande Entrevista com Vítor Gonçalves.',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'https://www.rtp.pt/play/p8064/e750623/fronteira',
'info_dict': {
'id': 'e750623',
'ext': 'mp4',
'title': 'Fronteira de 26 fev 2024',
'description': '1970. À aldeia de Fronteira chega um novo chefe de posto da Guarda Fiscal. Com convicções inabaláveis sobre a aplicação da Lei, rapidamente entr',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/e539826/portugues-1-ano',
'info_dict': {
'id': 'e539826',
'ext': 'mp4',
'title': 'Português - 1.º ano , aula 45 - 27 abr 2021 - Estudo Em Casa - RTP',
'description': 'A História do Pedrito Coelho, de Beatrix Potter. O dígrafo \'lh\' - A História do Pedrito Coelho, de Beatrix Potter. O dígrafo \'lh\'.',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'https://www.rtp.pt/play/zigzag/p11099/e747372/coelhos-corajosos',
'info_dict': {
'id': 'e747372',
'ext': 'mp4',
'title': 'Coelhos Corajosos Episódio 1 - de 12 fev 2024 - Zig Zag Play - RTP',
'description': 'Boo e o seu irmão mais velho, Bop, vivem grandes aventuras com os seus amigos, e com os seus quatro irmãos pequeninos. Juntos e com muita coragem, e',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'https://arquivos.rtp.pt/conteudos/liga-dos-ultimos-152/',
'info_dict': {
'id': 'liga-dos-ultimos-152',
'ext': 'mp4',
'title': 'Liga dos Últimos RTP Arquivos',
'description': 'Magazine desportivo, com apresentação de Álvaro Costa e comentários em estúdio do professor Hernâni Gonçalves e do sociólogo João Nuno Coelho. Destaque para os jogos de futebol das equipas dos escalões secundários de Portugal, com momentos dos jogos: Agrário de Lamas vs Pampilhoense e Apúlia vs Fragoso.',
'thumbnail': r're:^https?://.*\.(jpg|png)'
},
}, {
'url': 'https://www.rtp.pt/play/p510/aleixo-fm',
'only_matching': True,
}]
@ -43,42 +102,78 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta(
'twitter:title', webpage, display_name='title', fatal=True)
f, config = self._search_regex(
r'''(?sx)
var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
''', webpage,
'player config', group=('f', 'config'))
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title', default='')
f = self._parse_json(
f, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
config = self._parse_json(
config, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
# Raise error if episode is unavailable
if 'Este episódio não se encontra disponível' in title:
raise ExtractorError('Episode unavailable', expected=True)
# Replace irrelevant string in title
title = re.sub(r' - ?RTP Play - RTP', '', title)
# Check if it's a program split in parts
part = self._html_search_regex(r'section\-parts.*<span.*>(.+?)</span>.*</ul>', webpage, 'part', default=None)
# Add program part identification to title if it exists
title = join_nonempty(title, part, delim=' ')
try:
# Extract f and config from page
f, config = self._search_regex(
r'''(?sx)
var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
''', webpage,
'player config', group=('f', 'config'))
f = self._parse_json(
f, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
config = self._parse_json(
config, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
config['file'] = f
except RegexNotFoundError:
# Estudo em Casa / Zig Zag / RTP Arquivos pages don't include f
config = self._search_regex(
r'''(?sx)
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
''', webpage,
'just player config')
config = self._parse_json(
config, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id))
formats = []
if isinstance(f, dict):
f_hls = f.get('hls')
if f_hls is not None:
formats.extend(self._extract_m3u8_formats(
f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
file = config.get('file')
if isinstance(file, dict):
file_hls = file.get('hls')
file_fps = file.get('fps')
f_dash = f.get('dash')
if f_dash is not None:
formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
if file_hls is None and file_fps is not None:
file_hls = file_fps.replace('drm-fps', 'hls')
formats.extend(self._extract_m3u8_formats(
file_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
else:
formats.append({
'format_id': 'f',
'url': f,
'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
})
ext = determine_ext(file)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
file, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
else:
formats.append({
'format_id': 'f',
'url': file,
'vcodec': 'none' if config.get('mediaType') == 'audio' else None,
})
subtitles = {}
vtt = config.get('vtt')
if vtt is not None:
for lcode, lname, url in vtt:

View File

@ -361,7 +361,7 @@ def extract_count(key):
'like_count': extract_count('favoritings') or extract_count('likes'),
'comment_count': extract_count('comment'),
'repost_count': extract_count('reposts'),
'genre': info.get('genre'),
'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)),
'formats': formats if not extract_flat else None
}
@ -395,10 +395,10 @@ class SoundcloudIE(SoundcloudBaseIE):
_TESTS = [
{
'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
'md5': 'de9bac153e7427a7333b4b0c1b6a18d2',
'info_dict': {
'id': '62986583',
'ext': 'mp3',
'ext': 'opus',
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music',
@ -411,6 +411,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg',
'uploader_url': 'https://soundcloud.com/ethmusic',
'genres': [],
}
},
# geo-restricted
@ -418,7 +421,7 @@ class SoundcloudIE(SoundcloudBaseIE):
'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
'info_dict': {
'id': '47127627',
'ext': 'mp3',
'ext': 'opus',
'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
@ -431,6 +434,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'uploader_url': 'https://soundcloud.com/the-concept-band',
'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg',
'genres': ['Alternative'],
},
},
# private link
@ -452,6 +458,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'uploader_url': 'https://soundcloud.com/jaimemf',
'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
'genres': ['youtubedl'],
},
},
# private link (alt format)
@ -473,6 +482,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'uploader_url': 'https://soundcloud.com/jaimemf',
'thumbnail': 'https://a1.sndcdn.com/images/default_avatar_large.png',
'genres': ['youtubedl'],
},
},
# downloadable song
@ -482,6 +494,21 @@ class SoundcloudIE(SoundcloudBaseIE):
'info_dict': {
'id': '343609555',
'ext': 'wav',
'title': 'The Following',
'description': '',
'uploader': '80M',
'uploader_id': '312384765',
'uploader_url': 'https://soundcloud.com/the80m',
'upload_date': '20170922',
'timestamp': 1506120436,
'duration': 397.228,
'thumbnail': 'https://i1.sndcdn.com/artworks-000243916348-ktoo7d-original.jpg',
'license': 'all-rights-reserved',
'like_count': int,
'comment_count': int,
'repost_count': int,
'view_count': int,
'genres': ['Dance & EDM'],
},
},
# private link, downloadable format
@ -503,6 +530,9 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg',
'uploader_url': 'https://soundcloud.com/oriuplift',
'genres': ['Trance'],
},
},
# no album art, use avatar pic for thumbnail
@ -525,6 +555,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'uploader_url': 'https://soundcloud.com/garyvee',
'genres': [],
},
'params': {
'skip_download': True,
@ -532,13 +564,13 @@ class SoundcloudIE(SoundcloudBaseIE):
},
{
'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
'md5': '8227c3473a4264df6b02ad7e5b7527ac',
'info_dict': {
'id': '583011102',
'ext': 'mp3',
'ext': 'opus',
'title': 'Mezzo Valzer',
'description': 'md5:4138d582f81866a530317bae316e8b61',
'uploader': 'Micronie',
'description': 'md5:f4d5f39d52e0ccc2b4f665326428901a',
'uploader': 'Giovanni Sarani',
'uploader_id': '3352531',
'timestamp': 1551394171,
'upload_date': '20190228',
@ -549,6 +581,8 @@ class SoundcloudIE(SoundcloudBaseIE):
'like_count': int,
'comment_count': int,
'repost_count': int,
'genres': ['Piano'],
'uploader_url': 'https://soundcloud.com/giovannisarani',
},
},
{

View File

@ -12,6 +12,7 @@
jwt_decode_hs256,
traverse_obj,
try_call,
url_basename,
url_or_none,
urlencode_postdata,
variadic,
@ -194,8 +195,7 @@ def _real_extract(self, url):
return {
'id': video_id,
'formats': self._get_formats(video_data, (
(('protocolHls', 'url'), ('chromecastUrls', ...)), {url_or_none}), video_id),
'formats': self._get_formats(video_data, ('protocolHls', 'url', {url_or_none}), video_id),
**traverse_obj(metadata, {
'title': ('displayName', {str}),
'description': ('description', {str}),
@ -259,6 +259,10 @@ class WrestleUniversePPVIE(WrestleUniverseBaseIE):
'params': {
'skip_download': 'm3u8',
},
}, {
'note': 'manifest provides live-a (partial) and live-b (full) streams',
'url': 'https://www.wrestle-universe.com/en/lives/umc99R9XsexXrxr9VjTo9g',
'only_matching': True,
}]
_API_PATH = 'events'
@ -285,12 +289,16 @@ def _real_extract(self, url):
video_data, decrypt = self._call_encrypted_api(
video_id, ':watchArchive', 'watch archive', data={'method': 1})
info['formats'] = self._get_formats(video_data, (
('hls', None), ('urls', 'chromecastUrls'), ..., {url_or_none}), video_id)
# 'chromecastUrls' can be only partial videos, avoid
info['formats'] = self._get_formats(video_data, ('hls', (('urls', ...), 'url'), {url_or_none}), video_id)
for f in info['formats']:
# bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
if f.get('tbr'):
f['tbr'] = int(f['tbr'] / 2.5)
# prefer variants with the same basename as the master playlist to avoid partial streams
f['format_id'] = url_basename(f['url']).partition('.')[0]
if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]):
f['preference'] = -10
hls_aes_key = traverse_obj(video_data, ('hls', 'key', {decrypt}))
if hls_aes_key:

View File

@ -72,15 +72,15 @@ class YouPornIE(InfoExtractor):
'id': '16290308',
'age_limit': 18,
'categories': [],
'description': 'md5:00ea70f642f431c379763c17c2f396bc',
'description': str, # TODO: detect/remove SEO spam description in ytdl backport
'display_id': 'tinderspecial-trailer1',
'duration': 298.0,
'ext': 'mp4',
'upload_date': '20201123',
'uploader': 'Ersties',
'tags': [],
'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg',
'timestamp': 1606089600,
'thumbnail': r're:https://.+\.jpg',
'timestamp': 1606147564,
'title': 'Tinder In Real Life',
'view_count': int,
}
@ -88,11 +88,17 @@ class YouPornIE(InfoExtractor):
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
definitions = self._download_json(
f'https://www.youporn.com/api/video/media_definitions/{video_id}/', display_id or video_id)
self._set_cookie('.youporn.com', 'age_verified', '1')
webpage = self._download_webpage(f'https://www.youporn.com/watch/{video_id}', video_id)
definitions = self._search_json(r'\bplayervars\s*:', webpage, 'player vars', video_id)['mediaDefinitions']
def get_format_data(data, f):
return traverse_obj(data, lambda _, v: v['format'] == f and url_or_none(v['videoUrl']))
def get_format_data(data, stream_type):
info_url = traverse_obj(data, (lambda _, v: v['format'] == stream_type, 'videoUrl', {url_or_none}, any))
if not info_url:
return []
return traverse_obj(
self._download_json(info_url, video_id, f'Downloading {stream_type} info JSON', fatal=False),
lambda _, v: v['format'] == stream_type and url_or_none(v['videoUrl']))
formats = []
# Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s
@ -123,10 +129,6 @@ def get_format_data(data, f):
f['height'] = height
formats.append(f)
webpage = self._download_webpage(
'http://www.youporn.com/watch/%s' % video_id, display_id,
headers={'Cookie': 'age_verified=1'})
title = self._html_search_regex(
r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
webpage, 'title', default=None) or self._og_search_title(

View File

@ -132,6 +132,16 @@ def _check_extensions(self, extensions):
extensions.pop('cookiejar', None)
extensions.pop('timeout', None)
def send(self, request: Request) -> Response:
target = self._get_request_target(request)
try:
response = super().send(request)
except HTTPError as e:
e.response.extensions['impersonate'] = target
raise
response.extensions['impersonate'] = target
return response
def _send(self, request: Request):
max_redirects_exceeded = False
session: curl_cffi.requests.Session = self._get_instance(

View File

@ -497,6 +497,7 @@ class Response(io.IOBase):
@param headers: response headers.
@param status: Response HTTP status code. Default is 200 OK.
@param reason: HTTP status reason. Will use built-in reasons based on status code if not provided.
@param extensions: Dictionary of handler-specific response extensions.
"""
def __init__(
@ -505,7 +506,9 @@ def __init__(
url: str,
headers: Mapping[str, str],
status: int = 200,
reason: str = None):
reason: str = None,
extensions: dict = None
):
self.fp = fp
self.headers = Message()
@ -517,6 +520,7 @@ def __init__(
self.reason = reason or HTTPStatus(status).phrase
except ValueError:
self.reason = None
self.extensions = extensions or {}
def readable(self):
return self.fp.readable()