Compare commits

...

9 Commits

Author SHA1 Message Date
Subrat Lima
7700eb923b
Merge 2c0244cb2f into a9f85670d0 2024-11-12 09:13:01 +01:00
manav_chaudhary
a9f85670d0
[ie/Chaturbate] Support alternate domains (#10595)
Closes #10594
Authored by: manavchaudhary1
2024-11-11 23:41:56 +01:00
Sam
6b43a8d84b
[ie/goplay] Fix extractor (#11466)
Closes #10857
Authored by: SamDecrock, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-11-11 22:03:31 +00:00
Hugo
2db8c2e7d5
[ie/CloudflareStream] Avoid extraction via videodelivery.net (#11478)
Closes #11477
Authored by: hugovdev
2024-11-11 22:00:05 +00:00
bashonly
f9c8deb4e5
[build] Bump PyInstaller version pin to >=6.11.1 (#11507)
Authored by: bashonly
2024-11-11 21:19:03 +00:00
Sakura286
0ec9bfed4d
[ie/MixchMovie] Add extractor (#10897)
Closes #10765
Authored by: Sakura286
2024-11-11 21:40:29 +01:00
subrat-lima
2c0244cb2f [ie/atptour] refactored url pattern for better extensibility 2024-09-01 21:46:17 +05:30
subrat-lima
2fe0226c0f [ie/atptour] enhancement - added support for spanish pages 2024-09-01 20:17:25 +05:30
subrat-lima
485cbe4990 [ie/atptour] add extractor and updated data extraction function 2024-09-01 17:06:24 +05:30
9 changed files with 273 additions and 53 deletions

View File

@ -411,7 +411,7 @@ jobs:
run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds
python devscripts/install_deps.py -o --include build
python devscripts/install_deps.py --include curl-cffi
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.10.0-py3-none-any.whl"
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-6.11.1-py3-none-any.whl"
- name: Prepare
run: |
@ -460,7 +460,7 @@ jobs:
run: |
python devscripts/install_deps.py -o --include build
python devscripts/install_deps.py
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.10.0-py3-none-any.whl"
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/i686/pyinstaller-6.11.1-py3-none-any.whl"
- name: Prepare
run: |

View File

@ -83,7 +83,7 @@ test = [
"pytest-rerunfailures~=14.0",
]
pyinstaller = [
"pyinstaller>=6.10.0", # Windows temp cleanup fixed in 6.10.0
"pyinstaller>=6.11.1", # Windows temp cleanup fixed in 6.11.1
]
[project.urls]

View File

@ -169,6 +169,10 @@ from .asobichannel import (
AsobiChannelTagURLIE,
)
from .asobistage import AsobiStageIE
from .atptour import (
ATPTourNewsIE,
ATPTourVideoIE,
)
from .atresplayer import AtresPlayerIE
from .atscaleconf import AtScaleConfEventIE
from .atvat import ATVAtIE
@ -1156,6 +1160,7 @@ from .mitele import MiTeleIE
from .mixch import (
MixchArchiveIE,
MixchIE,
MixchMovieIE,
)
from .mixcloud import (
MixcloudIE,

127
yt_dlp/extractor/atptour.py Normal file
View File

@ -0,0 +1,127 @@
import re

from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    base_url,
    extract_attributes,
    get_element_html_by_id,
    traverse_obj,
    urljoin,
)
class ATPTourVideoIE(InfoExtractor):
    # Single video pages on atptour.com (English and Spanish); the actual
    # media is hosted on Brightcove, so extraction is delegated to BrightcoveNewIE.
    IE_NAME = 'atptour:video'
    _VALID_URL = r'https?://(?:www\.)?atptour\.com/(?:en|es)/video/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.atptour.com/en/video/challenger-highlights-nishikori-wins-in-como-2024',
        'md5': '4721002227d98fe89afafa40eba3068d',
        'info_dict': {
            'id': '6361099221112',
            'ext': 'mp4',
            'description': 'md5:ef8afed21c52cbe4ad3409045d59f413',
            'upload_date': '20240827',
            'duration': 105.152,
            'tags': 'count:6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Challenger Highlights: Nishikori wins in Como 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724775281,
        },
    }, {
        'url': 'https://www.atptour.com/en/video/highlights-svajda-earns-highestranked-win-of-career-vs-cerundolo-winstonsalem-2024',
        'md5': 'a3829d10bdcb1829568fd88b9e6ecb15',
        'info_dict': {
            'id': '6360716257112',
            'ext': 'mp4',
            'description': 'md5:a334aeb73eac631ffab8249b1e68194c',
            'upload_date': '20240820',
            'duration': 139.691,
            'tags': 'count:5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Highlights: Svajda earns highest-ranked win of career vs. Cerundolo Winston-Salem 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724183755,
        },
    }, {
        'url': 'https://www.atptour.com/es/video/highlights-michelsen-defeats-fucsovics-in-winston-salem-2024',
        'md5': '7ba4c3aabef9eb20a1b9877f28e6f775',
        'info_dict': {
            'id': '6360727636112',
            'ext': 'mp4',
            'description': 'md5:2c5682fdfa514e508c6d947e9e9b6eeb',
            'upload_date': '20240821',
            'duration': 135.424,
            'tags': 'count:6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Highlights: Michelsen defeats Fucsovics in Winston-Salem 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724205624,
        },
    }, {
        'url': 'https://www.atptour.com/en/video/highlights-sonego-dominates-michelsen-for-winston-salem-open-title-2024',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve an ATP Tour video page to its Brightcove player URL.

        Raises ExtractorError when the page, the featured-videos endpoint or
        the Brightcove identifiers cannot be obtained.
        """
        display_id = self._match_id(url)
        # Downloads are fatal on purpose: every later step needs their result,
        # and a non-fatal failure would only surface as an opaque TypeError.
        webpage = self._download_webpage(url, display_id, impersonate=True)

        # The endpoint is read from a hidden <input>, keyed by its class attribute
        endpoint = self._hidden_inputs(webpage, 'class').get('atp_featured-videos-endpoint')
        if not endpoint:
            raise ExtractorError('Unable to find the featured videos endpoint')

        json_data = self._download_json(
            urljoin(base_url(url), endpoint), display_id, impersonate=True)
        video_data = traverse_obj(json_data, ('content', 0, {dict})) or {}

        account_id = video_data.get('videoAccountId')
        player_id = video_data.get('videoPlayerId')
        video_id = video_data.get('videoId')
        # Refuse to build a player URL containing the literal text "None"
        if not (account_id and player_id and video_id):
            raise ExtractorError('Unable to extract Brightcove video information')

        return self.url_result(
            f'https://players.brightcove.net/{account_id}/{player_id}/index.html?videoId={video_id}', BrightcoveNewIE)
class ATPTourNewsIE(InfoExtractor):
    # News articles on atptour.com (English and Spanish). An article may carry
    # a lead video (a player element with data-* attributes) and any number of
    # Brightcove iframes; all of them are returned as one playlist.
    IE_NAME = 'atptour:news'
    _VALID_URL = r'https?://(?:www\.)?atptour\.com/(?:en|es)/news/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.atptour.com/en/news/sinner-zverev-cincinnati-2024-sf',
        'playlist_mincount': 2,
        'info_dict': {
            'id': 'sinner-zverev-cincinnati-2024-sf',
            'title': 'Jannik Sinner battles past Alexander Zverev to reach Cincinnati final | ATP Tour | Tennis',
            'description': 'md5:30cd3df666c8a5d45731d1e85d8d43ae',
        },
    }, {
        'url': 'https://www.atptour.com/en/news/borges-us-open-2024-this-is-tennis',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'borges-us-open-2024-this-is-tennis',
            'title': 'Nuno Borges: Building legos, facing Nadal, Cirque du Soleil & more | ATP Tour | Tennis',
            'description': 'md5:aaef866660c4e3ced69118c0f6ed237a',
        },
    }, {
        'url': 'https://www.atptour.com/es/news/popyrin-us-open-2024-feature',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'popyrin-us-open-2024-feature',
            'title': 'Alexei Popyrin: Hamilton, pollo frito y la revancha de Djokovic | ATP Tour | Tennis',
            'description': 'md5:b62a35720a278c9ab8410847915dc581',
        },
    }]

    def _real_extract(self, url):
        """Collect every Brightcove video embedded in a news article as a playlist."""
        display_id = self._match_id(url)
        # Fatal on purpose: the title, description and entries all come from this page
        webpage = self._download_webpage(url, display_id, impersonate=True)
        title = self._html_extract_title(webpage)
        description = self._og_search_description(webpage)

        entries = []
        first_video = get_element_html_by_id('articleVideoJSPlayer', webpage)
        if first_video:
            attributes = extract_attributes(first_video)
            account_id = attributes.get('data-account')
            player_id = attributes.get('data-player')
            video_id = attributes.get('data-video-id')
            # Skip the lead video rather than emit a URL containing "None"
            if account_id and player_id and video_id:
                entries.append(self.url_result(
                    f'https://players.brightcove.net/{account_id}/{player_id}/index.html?videoId={video_id}',
                    BrightcoveNewIE))

        # `[^>]+` (not a single `[^>]`) so iframes with attributes before `src` are matched too
        entries.extend(
            self.url_result(video_url, BrightcoveNewIE)
            for video_url in re.findall(
                r'<iframe[^>]+\bsrc="(https://players\.brightcove\.net/[^"]+)"', webpage))

        return self.playlist_result(entries, display_id, title, description)

View File

@ -9,7 +9,7 @@ from ..utils import (
class ChaturbateIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
_VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.(?P<tld>com|eu|global)/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://www.chaturbate.com/siswet19/',
'info_dict': {
@ -29,15 +29,24 @@ class ChaturbateIE(InfoExtractor):
}, {
'url': 'https://en.chaturbate.com/siswet19/',
'only_matching': True,
}, {
'url': 'https://chaturbate.eu/siswet19/',
'only_matching': True,
}, {
'url': 'https://chaturbate.eu/fullvideo/?b=caylin',
'only_matching': True,
}, {
'url': 'https://chaturbate.global/siswet19/',
'only_matching': True,
}]
_ROOM_OFFLINE = 'Room is currently offline'
def _real_extract(self, url):
video_id = self._match_id(url)
video_id, tld = self._match_valid_url(url).group('id', 'tld')
webpage = self._download_webpage(
f'https://chaturbate.com/{video_id}/', video_id,
f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers())
found_m3u8_urls = []

View File

@ -8,7 +8,7 @@ class CloudflareStreamIE(InfoExtractor):
_DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
_EMBED_RE = rf'(?:embed\.|{_SUBDOMAIN_RE}){_DOMAIN_RE}/embed/[^/?#]+\.js\?(?:[^#]+&)?video='
_ID_RE = r'[\da-f]{32}|eyJ[\w-]+\.[\w-]+\.[\w-]+'
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}{_DOMAIN_RE}/|{_EMBED_RE})(?P<id>{_ID_RE})'
_VALID_URL = rf'https?://(?:{_SUBDOMAIN_RE}(?P<domain>{_DOMAIN_RE})/|{_EMBED_RE})(?P<id>{_ID_RE})'
_EMBED_REGEX = [
rf'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE})(?:(?!\1).)*)\1',
rf'<iframe[^>]+\bsrc=["\'](?P<url>https?://{_SUBDOMAIN_RE}{_DOMAIN_RE}/[\da-f]{{32}})',
@ -19,7 +19,7 @@ class CloudflareStreamIE(InfoExtractor):
'id': '31c9291ab41fac05471db4e73aa11717',
'ext': 'mp4',
'title': '31c9291ab41fac05471db4e73aa11717',
'thumbnail': 'https://videodelivery.net/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
'thumbnail': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
@ -30,7 +30,7 @@ class CloudflareStreamIE(InfoExtractor):
'id': '0e8e040aec776862e1d632a699edf59e',
'ext': 'mp4',
'title': '0e8e040aec776862e1d632a699edf59e',
'thumbnail': 'https://videodelivery.net/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg',
'thumbnail': 'https://cloudflarestream.com/0e8e040aec776862e1d632a699edf59e/thumbnails/thumbnail.jpg',
},
}, {
'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
@ -54,7 +54,7 @@ class CloudflareStreamIE(InfoExtractor):
'id': 'eaef9dea5159cf968be84241b5cedfe7',
'ext': 'mp4',
'title': 'eaef9dea5159cf968be84241b5cedfe7',
'thumbnail': 'https://videodelivery.net/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
'thumbnail': 'https://cloudflarestream.com/eaef9dea5159cf968be84241b5cedfe7/thumbnails/thumbnail.jpg',
},
'params': {
'skip_download': 'm3u8',
@ -62,8 +62,9 @@ class CloudflareStreamIE(InfoExtractor):
}]
def _real_extract(self, url):
video_id = self._match_id(url)
domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'
video_id, domain = self._match_valid_url(url).group('id', 'domain')
if domain != 'bytehighway.net':
domain = 'cloudflarestream.com'
base_url = f'https://{domain}/{video_id}/'
if '.' in video_id:
video_id = self._parse_json(base64.urlsafe_b64decode(

View File

@ -1801,7 +1801,7 @@ class InfoExtractor:
return traverse_obj(ret, traverse) or {}
@staticmethod
def _hidden_inputs(html):
def _hidden_inputs(html, attr_list=('name', 'id')):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
hidden_inputs = {}
for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
@ -1810,7 +1810,10 @@ class InfoExtractor:
continue
if attrs.get('type') not in ('hidden', 'submit'):
continue
name = attrs.get('name') or attrs.get('id')
for attr in variadic(attr_list):
name = attrs.get(attr)
if name is not None:
break
value = attrs.get('value')
if name and value is not None:
hidden_inputs[name] = value

View File

@ -5,56 +5,63 @@ import hashlib
import hmac
import json
import os
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
remove_end,
traverse_obj,
unescapeHTML,
)
class GoPlayIE(InfoExtractor):
_VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)'
_VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/?#]+/[^/?#]+/|)(?P<id>[^/#]+)'
_NETRC_MACHINE = 'goplay'
_TESTS = [{
'url': 'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay',
'url': 'https://www.goplay.be/video/de-slimste-mens-ter-wereld/de-slimste-mens-ter-wereld-s22/de-slimste-mens-ter-wereld-s22-aflevering-1',
'info_dict': {
'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811',
'id': '2baa4560-87a0-421b-bffc-359914e3c387',
'ext': 'mp4',
'title': 'S3 - Aflevering 2',
'series': 'De Container Cup',
'season': 'Season 3',
'season_number': 3,
'episode': 'Episode 2',
'episode_number': 2,
'title': 'S22 - Aflevering 1',
'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}',
'series': 'De Slimste Mens ter Wereld',
'episode': 'Episode 1',
'season_number': 22,
'episode_number': 1,
'season': 'Season 22',
},
'params': {'skip_download': True},
'skip': 'This video is only available for registered users',
}, {
'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay',
'url': 'https://www.goplay.be/video/1917',
'info_dict': {
'id': '74e3ed07-748c-49e4-85a0-393a93337dbf',
'id': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
'ext': 'mp4',
'title': 'A Family for the Holidays',
'title': '1917',
'description': r're:Op het hoogtepunt van de Eerste Wereldoorlog krijgen twee jonge .{94}',
},
'params': {'skip_download': True},
'skip': 'This video is only available for registered users',
}, {
'url': 'https://www.goplay.be/video/de-mol/de-mol-s11/de-mol-s11-aflevering-1#autoplay',
'info_dict': {
'id': '03eb8f2f-153e-41cb-9805-0d3a29dab656',
'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee',
'ext': 'mp4',
'title': 'S11 - Aflevering 1',
'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}',
'episode': 'Episode 1',
'series': 'De Mol',
'season_number': 11,
'episode_number': 1,
'season': 'Season 11',
},
'params': {
'skip_download': True,
},
'params': {'skip_download': True},
'skip': 'This video is only available for registered users',
}]
@ -69,27 +76,42 @@ class GoPlayIE(InfoExtractor):
if not self._id_token:
raise self.raise_login_required(method='password')
def _real_extract(self, url):
url, display_id = self._match_valid_url(url).group(0, 'display_id')
webpage = self._download_webpage(url, display_id)
video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data')
video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data')
def _find_json(self, s):
    """Search string `s` for a bracketed JSON array that follows a `word:` prefix.

    Thin wrapper around `self._search_json`: the start pattern matches word
    characters, optional whitespace and a colon, and `contains_pattern`
    restricts the candidate content to a `[...]` span. No video id is passed
    (`None`), and `default=None` is supplied, so presumably a miss yields
    None instead of raising -- confirm against `_search_json`.
    """
    return self._search_json(
        r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
movie = video_data.get('movie')
if movie:
video_id = movie['videoUuid']
info_dict = {
'title': movie.get('title'),
}
else:
episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False)
video_id = episode['videoUuid']
info_dict = {
'title': episode.get('episodeTitle'),
'series': traverse_obj(episode, ('program', 'title')),
'season_number': episode.get('seasonNumber'),
'episode_number': episode.get('episodeNumber'),
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
nextjs_data = traverse_obj(
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
(..., {js_to_json}, {json.loads}, ..., {self._find_json}, ...))
meta = traverse_obj(nextjs_data, (
..., lambda _, v: v['meta']['path'] == urllib.parse.urlparse(url).path, 'meta', any))
video_id = meta['uuid']
info_dict = traverse_obj(meta, {
'title': ('title', {str}),
'description': ('description', {str.strip}),
})
if traverse_obj(meta, ('program', 'subtype')) != 'movie':
for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)):
episode_data = traverse_obj(
season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
if not episode_data:
continue
episode_title = traverse_obj(
episode_data, 'contextualTitle', 'episodeTitle', expected_type=str)
info_dict.update({
'title': episode_title or info_dict.get('title'),
'series': remove_end(info_dict.get('title'), f' - {episode_title}'),
'season_number': traverse_obj(season_data, ('season', {int_or_none})),
'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})),
})
break
api = self._download_json(
f'https://api.goplay.be/web/v1/videos/long-form/{video_id}',

View File

@ -12,7 +12,7 @@ from ..utils.traversal import traverse_obj
class MixchIE(InfoExtractor):
IE_NAME = 'mixch'
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)'
_VALID_URL = r'https?://mixch\.tv/u/(?P<id>\d+)'
_TESTS = [{
'url': 'https://mixch.tv/u/16943797/live',
@ -74,7 +74,7 @@ class MixchIE(InfoExtractor):
class MixchArchiveIE(InfoExtractor):
IE_NAME = 'mixch:archive'
_VALID_URL = r'https?://(?:www\.)?mixch\.tv/archive/(?P<id>\d+)'
_VALID_URL = r'https?://mixch\.tv/archive/(?P<id>\d+)'
_TESTS = [{
'url': 'https://mixch.tv/archive/421',
@ -116,3 +116,56 @@ class MixchArchiveIE(InfoExtractor):
'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id),
'thumbnail': traverse_obj(info_json, ('thumbnailURL', {url_or_none})),
}
class MixchMovieIE(InfoExtractor):
    # Short "movie" uploads on mixch.tv (https://mixch.tv/m/<id>), served as a
    # single direct MP4 file described by the api-web/movies JSON endpoint.
    IE_NAME = 'mixch:movie'
    _VALID_URL = r'https?://mixch\.tv/m/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://mixch.tv/m/Ve8KNkJ5',
        'info_dict': {
            'id': 'Ve8KNkJ5',
            'title': '夏☀️\nムービーへのポイントは本イベントに加算されないので配信にてお願い致します🙇🏻\u200d♀️\n#TGCCAMPUS #ミス東大 #ミス東大2024 ',
            'ext': 'mp4',
            'uploader': 'ミス東大No.5 松藤百香🍑💫',
            'uploader_id': '12299174',
            'channel_follower_count': int,
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'timestamp': 1724070828,
            'uploader_url': 'https://mixch.tv/u/12299174',
            'live_status': 'not_live',
            'upload_date': '20240819',
        },
    }, {
        'url': 'https://mixch.tv/m/61DzpIKE',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        movie_id = self._match_id(url)
        api_response = self._download_json(
            f'https://mixch.tv/api-web/movies/{movie_id}', movie_id)

        # The file URL is mandatory: a missing key raises here, before any
        # optional metadata is looked at.
        mp4_format = {
            'format_id': 'mp4',
            'url': api_response['movie']['file'],
            'ext': 'mp4',
        }

        # Everything below is optional and silently dropped when absent
        metadata = traverse_obj(api_response, {
            'title': ('movie', 'title', {str}),
            'thumbnail': ('movie', 'thumbnailURL', {url_or_none}),
            'uploader': ('ownerInfo', 'name', {str}),
            'uploader_id': ('ownerInfo', 'id', {int}, {str_or_none}),
            'channel_follower_count': ('ownerInfo', 'fan', {int_or_none}),
            'view_count': ('ownerInfo', 'view', {int_or_none}),
            'like_count': ('movie', 'favCount', {int_or_none}),
            'comment_count': ('movie', 'commentCount', {int_or_none}),
            'timestamp': ('movie', 'published', {int_or_none}),
            'uploader_url': ('ownerInfo', 'id', {lambda x: x and f'https://mixch.tv/u/{x}'}, filter),
        })

        info = {'id': movie_id, 'formats': [mp4_format]}
        info.update(metadata)
        info['live_status'] = 'not_live'
        return info