Compare commits

...

7 Commits

Author SHA1 Message Date
bashonly
7d1fa641aa
Merge 8cbf4659bd into eb15fd5a32 2024-11-17 21:17:34 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
bashonly
8cbf4659bd
Merge branch 'yt-dlp:master' into fix/foxsports 2024-07-01 22:38:34 -05:00
bashonly
0bfcbc79b6
Merge branch 'yt-dlp:master' into fix/foxsports 2024-06-12 01:07:42 -05:00
bashonly
d07eb72f6f
fun
Authored by: bashonly
2024-04-05 22:18:44 -05:00
bashonly
734d7bd4a3
[ie/fox] Support more FOX Sports URLs
Authored by: bashonly
2024-04-05 22:05:58 -05:00
5 changed files with 213 additions and 19 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

View File

@ -7,17 +7,19 @@ from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
join_nonempty,
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
traverse_obj, traverse_obj,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none, url_or_none,
urljoin,
) )
class FOXIE(InfoExtractor): class FOXIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?P<id>[\da-fA-F]+)' _VALID_URL = r'https?://(?:www\.)?fox(?:sports)?\.com/(?:watch|replay)/(?!play-)(?P<id>[\w-]+)'
_TESTS = [{ _TESTS = [{
# clip # clip
'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
@ -30,8 +32,9 @@ class FOXIE(InfoExtractor):
'duration': 102, 'duration': 102,
'timestamp': 1504291893, 'timestamp': 1504291893,
'upload_date': '20170901', 'upload_date': '20170901',
'creator': 'FOX', 'creators': ['FOX'],
'series': 'Gotham', # actual series name 'Gotham' is no longer returned by the API
'series': 'Aftermath: Bruce Wayne Develops Into The Dark Knight',
'age_limit': 14, 'age_limit': 14,
'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
@ -39,6 +42,24 @@ class FOXIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, {
# XML endpoint
'url': 'https://www.foxsports.com/watch/fmc-m2du80v5ewz11pbw',
'md5': '5451a633a5ca87b582a4d025df6852e6',
'info_dict': {
'id': 'fmc-m2du80v5ewz11pbw',
'ext': 'mp4',
'title': 'WWE FRIDAY NIGHT SMACKDOWN',
'description': 'From Fiserv Forum in Milwaukee, WI',
'duration': 5367,
'timestamp': 1698176671,
'upload_date': '20231024',
'creators': ['fox-digital'],
'series': 'WWE FRIDAY NIGHT SMACKDOWN',
'age_limit': 0,
'episode': 'WWE FRIDAY NIGHT SMACKDOWN',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, { }, {
# episode, geo-restricted # episode, geo-restricted
'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
@ -54,9 +75,13 @@ class FOXIE(InfoExtractor):
}] }]
_GEO_BYPASS = False _GEO_BYPASS = False
_HOME_PAGE_URL = 'https://www.fox.com/' _HOME_PAGE_URL = 'https://www.fox.com/'
_API_KEY = '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9' _API_KEY = '6E9S4bmcoNnZwVLOHywOv8PJEdu76cM9' # sports: 'cf289e299efdfa39fb6316f259d1de93'
_access_token = None _access_token = None
_device_id = str(uuid.uuid4()) _device_id = str(uuid.uuid4())
_XML_NS = {
'vmap': 'http://www.iab.net/videosuite/vmap',
'yospacenet': 'http://www.yospace.com/extension',
}
def _call_api(self, path, video_id, data=None): def _call_api(self, path, video_id, data=None):
headers = { headers = {
@ -66,7 +91,7 @@ class FOXIE(InfoExtractor):
headers['Authorization'] = 'Bearer ' + self._access_token headers['Authorization'] = 'Bearer ' + self._access_token
try: try:
return self._download_json( return self._download_json(
'https://api3.fox.com/v2.0/' + path, urljoin('https://api3.fox.com/v2.0/', path),
video_id, data=data, headers=headers) video_id, data=data, headers=headers)
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 403: if isinstance(e.cause, HTTPError) and e.cause.status == 403:
@ -100,8 +125,8 @@ class FOXIE(InfoExtractor):
f'previewpassmvpd?device_id={self._device_id}&mvpd_id=TempPass_fbcfox_60min', f'previewpassmvpd?device_id={self._device_id}&mvpd_id=TempPass_fbcfox_60min',
video_id)['accessToken'] video_id)['accessToken']
video = self._call_api('watch', video_id, data=json.dumps({ video = self._call_api('https://prod.api.video.fox/v2.0/watch', video_id, data=json.dumps({
'capabilities': ['drm/widevine', 'fsdk/yo'], 'capabilities': ['fsdk/yo/v3'],
'deviceWidth': 1280, 'deviceWidth': 1280,
'deviceHeight': 720, 'deviceHeight': 720,
'maxRes': '720p', 'maxRes': '720p',
@ -116,13 +141,16 @@ class FOXIE(InfoExtractor):
'privacy': {'us': '1---'}, 'privacy': {'us': '1---'},
'siteSection': '', 'siteSection': '',
'streamType': 'vod', 'streamType': 'vod',
'streamId': video_id}).encode()) 'streamId': video_id,
}).encode())
title = video['name']
release_url = video['url']
try: try:
m3u8_url = self._download_json(release_url, video_id)['playURL'] if playback_url := traverse_obj(video, ('playbackUrl', {url_or_none})):
xml_data = self._download_xml(playback_url, video_id)
stream = xml_data.find('vmap:Extensions/vmap:Extension/yospacenet:Stream', self._XML_NS)
m3u8_url = join_nonempty('https://', stream.get('urlDomain'), stream.get('urlSuffix'), delim='')
else:
m3u8_url = self._download_json(video['url'], video_id)['playURL']
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 403: if isinstance(e.cause, HTTPError) and e.cause.status == 403:
error = self._parse_json(e.cause.response.read().decode(), video_id) error = self._parse_json(e.cause.response.read().decode(), video_id)
@ -130,9 +158,11 @@ class FOXIE(InfoExtractor):
self.raise_geo_restricted(countries=['US']) self.raise_geo_restricted(countries=['US'])
raise ExtractorError(error['description'], expected=True) raise ExtractorError(error['description'], expected=True)
raise raise
formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', if not m3u8_url or m3u8_url == 'https://':
entry_protocol='m3u8_native', m3u8_id='hls') raise ExtractorError('Unable to extract m3u8 url')
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls')
data = try_get( data = try_get(
video, lambda x: x['trackingData']['properties'], dict) or {} video, lambda x: x['trackingData']['properties'], dict) or {}
@ -157,7 +187,7 @@ class FOXIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': video.get('name'),
'formats': formats, 'formats': formats,
'description': video.get('description'), 'description': video.get('description'),
'duration': duration, 'duration': duration,

View File

@ -5,7 +5,7 @@ from ..utils import float_or_none, make_archive_id, smuggle_url
class FoxSportsIE(InfoExtractor): class FoxSportsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?foxsports\.com/watch/(?P<id>[\w-]+)' _VALID_URL = r'https?://(?:www\.)?foxsports\.com/watch/(?P<id>play-[\w-]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.foxsports.com/watch/play-612168c6700004b', 'url': 'https://www.foxsports.com/watch/play-612168c6700004b',
'info_dict': { 'info_dict': {

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    # Matches single-video pages on video.kenh14.vn; the digits right before ".chn" are the video id
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video page.

        The page's element with type="VideoStream" supplies the scheme-less
        media location ("data-vid"). That location is used three ways: as a
        direct HTTP format, as the "FileName" for the metadata API, and (with
        ".json" appended) as the source of optional HLS/DASH manifest URLs.
        Both JSON downloads are non-fatal, so the page's own tags act as
        fallbacks for title/description/thumbnail.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        player_attrs = extract_attributes(player_html or '')
        # Required attribute: the subscript lookup fails if the player element is absent
        media_path = player_attrs['data-vid']

        file_name = remove_start(media_path, 'kenh14cdn.com/')
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(file_name),
            video_id, fatal=False)

        # The direct file is always offered; manifests below only add to it
        formats = [{
            'url': f'https://{media_path}',
            'format_id': 'http',
            'quality': 1,
        }]
        subtitles = {}

        manifests = self._download_json(
            f'https://{media_path}.json', video_id, note='Downloading video data', fatal=False)

        hls_url = traverse_obj(manifests, ('hls', {url_or_none}))
        if hls_url:
            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        dash_url = traverse_obj(manifests, ('mpd', {url_or_none}))
        if dash_url:
            dash_formats, dash_subtitles = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(dash_formats)
            self._merge_subtitles(dash_subtitles, target=subtitles)

        api_fields = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })

        # Webpage lookups are kept in title -> description -> thumbnail -> tags order,
        # and each fallback is only evaluated when the preceding source is empty
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or player_attrs.get('data-thumb')
        keywords = self._html_search_meta('keywords', webpage)
        # Keywords are a ";"-separated list; empty pieces are filtered out
        tags = traverse_obj(keywords, ({lambda kw: kw.split(';')}, ..., filter))

        return {
            **api_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    # Matches playlist pages on video.kenh14.vn; the digits right before ".chn" are the playlist id
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist result from a playlist page.

        Each element with class "video-item" becomes one entry, addressed by
        its "data-id" attribute and delegated to Kenh14VideoIE. Title and
        description come from the "category-detail" element, falling back to
        the first JSON-LD object that has both "name" and "alternateName".
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        json_ld_entries = self._yield_json_ld(webpage, playlist_id)
        embed_info = traverse_obj(
            json_ld_entries,
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def entry_url(item_html):
            # Rebuild a canonical video page URL from the item's numeric id
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(
                extract_attributes(item_html)['data-id'])

        video_items = get_elements_html_by_class('video-item', webpage)
        playlist_title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        playlist_description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # The thumbnail URL is passed on with its query string removed
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            video_items, playlist_id, playlist_title,
            getter=entry_url, ie=Kenh14VideoIE,
            playlist_description=playlist_description,
            thumbnail=thumbnail)