Compare commits

...

12 Commits

Author SHA1 Message Date
Grabien
4fcea82e46
Merge 3e1c1f2e39 into eb15fd5a32 2024-11-17 21:14:56 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
Grabien
3e1c1f2e39
Merge branch 'yt-dlp:master' into master 2024-07-27 19:09:02 +03:00
Grabien
4ff30140d1
Merge branch 'yt-dlp:master' into master 2024-07-15 12:43:31 +03:00
Grabien
9483bb86df
Merge branch 'yt-dlp:master' into master 2024-05-20 17:25:13 +03:00
Grabien
fe354fa548 Merge branch 'master' of https://github.com/Grabien/yt-dlp 2024-03-20 18:51:52 +02:00
Grabien
35b614b598
Merge branch 'yt-dlp:master' into master 2024-03-20 18:51:42 +02:00
Grabien
974babf965 [ie/senategov] URL parsing fix 2024-03-20 18:50:50 +02:00
pukkandan
c6f0d05213
Update yt_dlp/extractor/senategov.py 2024-03-05 00:59:33 +05:30
Grabien
2f54918f30 [senategod] Committees data update 2024-03-04 19:11:26 +02:00
Grabien
424dd9e061 [senategov] Support for new video paths 2024-03-04 19:01:18 +02:00
4 changed files with 212 additions and 40 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
_VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
_TESTS = [{
'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
'md5': '1ed67f9c3a1e74acf15db69590cf6210',
'info_dict': {
'id': '316173',
'ext': 'mp4',
'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
'tags': [],
'uploader': 'Unbox Therapy',
'upload_date': '20220517',
'view_count': int,
'duration': 722.86,
'timestamp': 1652764468,
},
}, {
'url': 'https://video.kenh14.vn/video-316174.chn',
'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
'info_dict': {
'id': '316174',
'ext': 'mp4',
'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
'tags': [],
'upload_date': '20220517',
'view_count': int,
'duration': 70.04,
'timestamp': 1652766021,
},
}, {
'url': 'https://video.kenh14.vn/0-344740.chn',
'md5': 'b843495d5e728142c8870c09b46df2a9',
'info_dict': {
'id': '344740',
'ext': 'mov',
'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
'uploader': 'Quang Vũ',
'upload_date': '20241024',
'view_count': int,
'duration': 198.88,
'timestamp': 1729741590,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
direct_url = attrs['data-vid']
metadata = self._download_json(
'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)
formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
subtitles = {}
video_data = self._download_json(
f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_url, video_id, m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
fmts, subs = self._extract_mpd_formats_and_subtitles(
dash_url, video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
**traverse_obj(metadata, {
'duration': ('duration', {parse_duration}),
'uploader': ('author', {strip_or_none}),
'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
'view_count': ('views', {int_or_none}),
}),
'id': video_id,
'title': (
traverse_obj(metadata, ('title', {strip_or_none}))
or clean_html(self._og_search_title(webpage))
or clean_html(get_element_by_class('vdbw-title', webpage))),
'formats': formats,
'subtitles': subtitles,
'description': (
clean_html(self._og_search_description(webpage))
or clean_html(get_element_by_class('vdbw-sapo', webpage))),
'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
{lambda x: x.split(';')}, ..., filter)),
}
class Kenh14PlaylistIE(InfoExtractor):
_VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
_TESTS = [{
'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
'info_dict': {
'id': '71',
'title': 'Trần Tình (Naked love) mùa 2',
'description': 'md5:e9522339304956dea931722dd72eddb2',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
},
'playlist_count': 9,
}, {
'url': 'https://video.kenh14.vn/playlist/0-72.chn',
'info_dict': {
'id': '72',
'title': 'Lau Lại Đầu Từ',
'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
},
'playlist_count': 6,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
category_detail = get_element_by_class('category-detail', webpage) or ''
embed_info = traverse_obj(
self._yield_json_ld(webpage, playlist_id),
(lambda _, v: v['name'] and v['alternateName'], any)) or {}
return self.playlist_from_matches(
get_elements_html_by_class('video-item', webpage), playlist_id,
(clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
ie=Kenh14VideoIE, playlist_description=(
clean_html(get_element_by_class('description', category_detail))
or unescapeHTML(embed_info.get('alternateName'))),
thumbnail=traverse_obj(
self._og_search_thumbnail(webpage),
({url_or_none}, {update_url(query=None)})))

View File

@ -9,37 +9,37 @@ from ..utils import (
) )
_COMMITTEES = { _COMMITTEES = {
'ag': ('76440', 'http://ag-f.akamaihd.net'), 'ag': ('76440', 'https://ag-f.akamaihd.net', '2036803', 'agriculture'),
'aging': ('76442', 'http://aging-f.akamaihd.net'), 'aging': ('76442', 'https://aging-f.akamaihd.net', '2036801', 'aging'),
'approps': ('76441', 'http://approps-f.akamaihd.net'), 'approps': ('76441', 'https://approps-f.akamaihd.net', '2036802', 'appropriations'),
'arch': ('', 'http://ussenate-f.akamaihd.net'), 'arch': ('', 'https://ussenate-f.akamaihd.net/', '', 'arch'),
'armed': ('76445', 'http://armed-f.akamaihd.net'), 'armed': ('76445', 'https://armed-f.akamaihd.net', '2036800', 'armedservices'),
'banking': ('76446', 'http://banking-f.akamaihd.net'), 'banking': ('76446', 'https://banking-f.akamaihd.net', '2036799', 'banking'),
'budget': ('76447', 'http://budget-f.akamaihd.net'), 'budget': ('76447', 'https://budget-f.akamaihd.net', '2036798', 'budget'),
'cecc': ('76486', 'http://srs-f.akamaihd.net'), 'cecc': ('76486', 'https://srs-f.akamaihd.net', '2036782', 'srs_cecc'),
'commerce': ('80177', 'http://commerce1-f.akamaihd.net'), 'commerce': ('80177', 'https://commerce1-f.akamaihd.net', '2036779', 'commerce'),
'csce': ('75229', 'http://srs-f.akamaihd.net'), 'csce': ('75229', 'https://srs-f.akamaihd.net', '2036777', 'srs_srs'),
'dpc': ('76590', 'http://dpc-f.akamaihd.net'), 'dpc': ('76590', 'https://dpc-f.akamaihd.net', '', 'dpc'),
'energy': ('76448', 'http://energy-f.akamaihd.net'), 'energy': ('76448', 'https://energy-f.akamaihd.net', '2036797', 'energy'),
'epw': ('76478', 'http://epw-f.akamaihd.net'), 'epw': ('76478', 'https://epw-f.akamaihd.net', '2036783', 'environment'),
'ethics': ('76449', 'http://ethics-f.akamaihd.net'), 'ethics': ('76449', 'https://ethics-f.akamaihd.net', '2036796', 'ethics'),
'finance': ('76450', 'http://finance-f.akamaihd.net'), 'finance': ('76450', 'https://finance-f.akamaihd.net', '2036795', 'finance_finance'),
'foreign': ('76451', 'http://foreign-f.akamaihd.net'), 'foreign': ('76451', 'https://foreign-f.akamaihd.net', '2036794', 'foreignrelations'),
'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'), 'govtaff': ('76453', 'https://govtaff-f.akamaihd.net', '2036792', 'hsgac'),
'help': ('76452', 'http://help-f.akamaihd.net'), 'help': ('76452', 'https://help-f.akamaihd.net', '2036793', 'help'),
'indian': ('76455', 'http://indian-f.akamaihd.net'), 'indian': ('76455', 'https://indian-f.akamaihd.net', '2036791', 'indianaffairs'),
'intel': ('76456', 'http://intel-f.akamaihd.net'), 'intel': ('76456', 'https://intel-f.akamaihd.net', '2036790', 'intelligence'),
'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'), 'intlnarc': ('76457', 'https://intlnarc-f.akamaihd.net', '', 'internationalnarcoticscaucus'),
'jccic': ('85180', 'http://jccic-f.akamaihd.net'), 'jccic': ('85180', 'https://jccic-f.akamaihd.net', '2036778', 'jccic'),
'jec': ('76458', 'http://jec-f.akamaihd.net'), 'jec': ('76458', 'https://jec-f.akamaihd.net', '2036789', 'jointeconomic'),
'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'), 'judiciary': ('76459', 'https://judiciary-f.akamaihd.net', '2036788', 'judiciary'),
'rpc': ('76591', 'http://rpc-f.akamaihd.net'), 'rpc': ('76591', 'https://rpc-f.akamaihd.net', '', 'rpc'),
'rules': ('76460', 'http://rules-f.akamaihd.net'), 'rules': ('76460', 'https://rules-f.akamaihd.net', '2036787', 'rules'),
'saa': ('76489', 'http://srs-f.akamaihd.net'), 'saa': ('76489', 'https://srs-f.akamaihd.net', '2036780', 'srs_saa'),
'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'), 'smbiz': ('76461', 'https://smbiz-f.akamaihd.net', '2036786', 'smallbusiness'),
'srs': ('75229', 'http://srs-f.akamaihd.net'), 'srs': ('75229', 'https://srs-f.akamaihd.net', '2031966', 'srs_srs'),
'uscc': ('76487', 'http://srs-f.akamaihd.net'), 'uscc': ('76487', 'https://srs-f.akamaihd.net', '2036781', 'srs_uscc'),
'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'), 'vetaff': ('76462', 'https://vetaff-f.akamaihd.net', '2036785', 'veteransaffairs'),
} }
@ -173,15 +173,23 @@ class SenateGovIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._generic_id(url) display_id = self._generic_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
parse_info = parse_qs(self._search_regex( iframe_src = self._search_regex(
r'<iframe class="[^>"]*streaminghearing[^>"]*"\s[^>]*\bsrc="([^">]*)', webpage, 'hearing URL')) (r'<iframe class="[^>"]*streaminghearing[^>"]*"\s[^>]*\bsrc="([^">]*)',
r'<iframe title="[^>"]*[^>"]*"\s[^>]*\bsrc="([^">]*)'),
stream_num, stream_domain = _COMMITTEES[parse_info['comm'][-1]] webpage, 'hearing URL').replace('&amp;', '&')
parse_info = parse_qs(iframe_src)
comm = parse_info['comm'][-1]
stream_num, stream_domain, stream_id, msl3 = _COMMITTEES[comm]
filename = parse_info['filename'][-1] filename = parse_info['filename'][-1]
formats = self._extract_m3u8_formats( urls_alternatives = [f'https://www-senate-gov-media-srs.akamaized.net/hls/live/{stream_id}/{comm}/{filename}/master.m3u8',
f'https://www-senate-gov-msl3archive.akamaized.net/{msl3}/{filename}_1/master.m3u8',
f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8', f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8',
display_id, ext='mp4') f'{stream_domain}/i/{filename}.mp4/master.m3u8']
for video_url in urls_alternatives:
formats = self._extract_m3u8_formats(video_url, display_id, ext='mp4', fatal=False)
if formats:
break
title = self._html_search_regex( title = self._html_search_regex(
(*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title') (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title')