Compare commits

...

11 Commits

Author SHA1 Message Date
nixxo
419d041efa
Merge 87c8703f90 into eb15fd5a32 2024-11-17 21:33:19 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
nixxo
87c8703f90
Merge branch 'master' into valid-urls-proposal 2023-06-25 10:47:04 +02:00
nixxo
1c3a9489ca
Merge branch 'master' into valid-urls-proposal 2023-01-12 21:50:40 +01:00
nixxo
a73ef8c776
Improved documentation for _VALID_URL
Co-authored-by: dirkf <fieldhouse@gmx.net>
2023-01-12 21:40:41 +01:00
nixxo
518d585eb7
_VALID_URLS > _VALID_URL 2022-12-30 23:56:09 +01:00
nixxo
42d0fba2bf
Implemented solution suggested by @dirkf using variadic 2022-12-30 23:55:32 +01:00
nixxo
7b93fb5ddc
[extractor/rai] using new _VALID_URLS property
- implemented _VALID_URLS property
- deleted a couple of subclasses
- added '_old_archive_ids' to manage back-compatibility with removed subclasses (by @pukkandan)
2022-12-16 11:27:14 +01:00
nixxo
74b5d34794
[extractor/la7] testing implementation of new _VALID_URLS property 2022-12-16 11:26:39 +01:00
nixxo
db96683cf1
[core] _VALID_URLS implementation
- implemented _VALID_URLS extractor property
- improvement suggested by @Grub4k
2022-12-16 11:26:33 +01:00
5 changed files with 222 additions and 54 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,
@ -1665,10 +1669,8 @@ from .rai import (
RaiIE, RaiIE,
RaiNewsIE, RaiNewsIE,
RaiPlayIE, RaiPlayIE,
RaiPlayLiveIE,
RaiPlayPlaylistIE, RaiPlayPlaylistIE,
RaiPlaySoundIE, RaiPlaySoundIE,
RaiPlaySoundLiveIE,
RaiPlaySoundPlaylistIE, RaiPlaySoundPlaylistIE,
RaiSudtirolIE, RaiSudtirolIE,
) )

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    # Handles single-video pages on video.kenh14.vn; the numeric suffix
    # before ".chn" is used as the video id.
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        # URL with the "/video/" path prefix and a full slug
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        # URL without the "/video/" prefix, minimal "video-<id>" slug
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        # "0-<id>" slug; this entry is expected to resolve to a .mov file
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        # Build the info dict for one video page: a direct HTTP format taken
        # from the player element, plus any HLS/DASH manifests advertised by
        # the companion "<direct_url>.json" document.
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The player element is located by type="VideoStream"; when it is
        # absent an empty string is parsed, yielding an empty attribute dict.
        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage) or ''
        attrs = extract_attributes(player_html)
        # 'data-vid' is mandatory: a missing attribute raises KeyError here.
        direct_url = attrs['data-vid']

        # Optional metadata lookup, keyed by the file name with the CDN host
        # prefix stripped. A failed request is tolerated (fatal=False).
        file_name = remove_start(direct_url, 'kenh14cdn.com/')
        metadata = self._download_json(
            f'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={file_name}',
            video_id, fatal=False)

        # 'data-vid' carries no scheme, so https:// is prepended.
        formats = [{
            'url': f'https://{direct_url}',
            'format_id': 'http',
            'quality': 1,
        }]
        subtitles = {}

        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)

        # (key in video_data, extraction method, manifest-id keyword argument);
        # processed in this order so HLS formats precede DASH formats.
        manifest_sources = (
            ('hls', self._extract_m3u8_formats_and_subtitles, {'m3u8_id': 'hls'}),
            ('mpd', self._extract_mpd_formats_and_subtitles, {'mpd_id': 'dash'}),
        )
        for data_key, extract_manifest, id_kwargs in manifest_sources:
            manifest_url = traverse_obj(video_data, (data_key, {url_or_none}))
            if not manifest_url:
                continue
            fmts, subs = extract_manifest(manifest_url, video_id, fatal=False, **id_kwargs)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        # Fields sourced from the metadata API come first in the result.
        info = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })

        # Title preference: API metadata, then OpenGraph, then page markup.
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')
        # The keywords meta value is split on ';' and empty entries dropped.
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda keywords: keywords.split(';')}, ..., filter))

        return {
            **info,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    # Handles playlist pages on video.kenh14.vn; every 'video-item' element
    # found on the page is delegated to Kenh14VideoIE.
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        # Build a playlist result from the 'video-item' elements on the page.
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # Primary source for title/description; falls back to '' so the
        # class lookups below simply find nothing.
        category_detail = get_element_by_class('category-detail', webpage) or ''
        # Fallback source: the first JSON-LD object for which both 'name'
        # and 'alternateName' are truthy.
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def entry_url(item_html):
            # Canonical video URL built from the item's mandatory 'data-id'.
            item_id = extract_attributes(item_html)['data-id']
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(item_id)

        video_items = get_elements_html_by_class('video-item', webpage)
        playlist_title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        playlist_description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # The OpenGraph thumbnail URL is validated and its query string removed.
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            video_items, playlist_id, playlist_title,
            getter=entry_url, ie=Kenh14VideoIE,
            playlist_description=playlist_description,
            thumbnail=thumbnail)

View File

@ -7,10 +7,11 @@ from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate
class LA7IE(InfoExtractor): class LA7IE(InfoExtractor):
IE_NAME = 'la7.it' IE_NAME = 'la7.it'
_VALID_URL = r'''(?x)https?://(?: _VALID_URL = [
(?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video|news)/| r'https?://(?:www\.)?la7\.it/[^/]+/(?:rivedila7|video|news)/.+-(?P<id>\d{5,})',
tg\.la7\.it/repliche-tgla7\?id= r'https?://tg\.la7\.it/repliche-tgla7\?id=(?P<id>\d{5,})',
)(?P<id>.+)''' r'https?://tg\.la7\.it(?:/[^/]+)+-(?P<id>\d{5,})'
]
_TESTS = [{ _TESTS = [{
# single quality video # single quality video
@ -39,7 +40,7 @@ class LA7IE(InfoExtractor):
'formats': 'count:8', 'formats': 'count:8',
}, },
}, { }, {
'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', 'url': 'https://tg.la7.it/repliche-tgla7?id=464601',
'only_matching': True, 'only_matching': True,
}] }]
_HOST = 'https://awsvodpkg.iltrovatore.it' _HOST = 'https://awsvodpkg.iltrovatore.it'

View File

@ -10,6 +10,7 @@ from ..utils import (
filter_dict, filter_dict,
int_or_none, int_or_none,
join_nonempty, join_nonempty,
make_archive_id,
parse_duration, parse_duration,
remove_start, remove_start,
strip_or_none, strip_or_none,
@ -250,7 +251,10 @@ class RaiBaseIE(InfoExtractor):
class RaiPlayIE(RaiBaseIE): class RaiPlayIE(RaiBaseIE):
_VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' _VALID_URL = [
rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)',
r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))',
]
_TESTS = [{ _TESTS = [{
'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'url': 'https://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
@ -321,6 +325,25 @@ class RaiPlayIE(RaiBaseIE):
'timestamp': 1348495020, 'timestamp': 1348495020,
'upload_date': '20120924', 'upload_date': '20120924',
}, },
}, {
# live stream
'url': 'https://www.raiplay.it/dirette/rainews24',
'info_dict': {
'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
'display_id': 'rainews24',
'ext': 'mp4',
'title': r're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
'uploader': 'Rai News 24',
'creator': 'Rai News 24',
'is_live': True,
'live_status': 'is_live',
'upload_date': '20090502',
'timestamp': 1241276220,
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True, 'only_matching': True,
@ -354,6 +377,7 @@ class RaiPlayIE(RaiBaseIE):
return { return {
'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'id': remove_start(media.get('id'), 'ContentItem-') or video_id,
'display_id': video_id, 'display_id': video_id,
'_old_archive_ids': [make_archive_id('RaiPlayLive', video_id)] if not re.match(RaiBaseIE._UUID_RE, video_id) else None,
'title': media.get('name'), 'title': media.get('name'),
'alt_title': strip_or_none(alt_title or None), 'alt_title': strip_or_none(alt_title or None),
'description': media.get('description'), 'description': media.get('description'),
@ -377,28 +401,6 @@ class RaiPlayIE(RaiBaseIE):
} }
class RaiPlayLiveIE(RaiPlayIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'http://www.raiplay.it/dirette/rainews24',
'info_dict': {
'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
'display_id': 'rainews24',
'ext': 'mp4',
'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
'uploader': 'Rai News 24',
'creator': 'Rai News 24',
'is_live': True,
'live_status': 'is_live',
'upload_date': '20090502',
'timestamp': 1241276220,
'formats': 'count:3',
},
'params': {'skip_download': True},
}]
class RaiPlayPlaylistIE(InfoExtractor): class RaiPlayPlaylistIE(InfoExtractor):
_VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?' _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
_TESTS = [{ _TESTS = [{
@ -463,7 +465,10 @@ class RaiPlayPlaylistIE(InfoExtractor):
class RaiPlaySoundIE(RaiBaseIE): class RaiPlaySoundIE(RaiBaseIE):
_VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' _VALID_URL = [
rf'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)',
r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)',
]
_TESTS = [{ _TESTS = [{
'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html', 'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
@ -482,7 +487,26 @@ class RaiPlaySoundIE(RaiBaseIE):
'timestamp': 1638346620, 'timestamp': 1638346620,
'upload_date': '20211201', 'upload_date': '20211201',
}, },
'params': {'skip_download': True}, 'params': {
'skip_download': True,
},
}, {
'url': 'https://www.raiplaysound.it/radio2',
'info_dict': {
'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44',
'display_id': 'radio2',
'ext': 'mp4',
'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+',
'thumbnail': r're:https://www\.raiplaysound\.it/dl/img/.+png',
'uploader': 'rai radio 2',
'series': 'Rai Radio 2',
'creator': 'raiplaysound',
'is_live': True,
'live_status': 'is_live',
},
'params': {
'skip_download': 'live',
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -506,6 +530,7 @@ class RaiPlaySoundIE(RaiBaseIE):
**info, **info,
'id': uid or audio_id, 'id': uid or audio_id,
'display_id': audio_id, 'display_id': audio_id,
'_old_archive_ids': [make_archive_id('RaiPlaySoundLive', audio_id)] if not re.match(RaiBaseIE._UUID_RE, audio_id) else None,
'title': traverse_obj(media, 'title', 'episode_title'), 'title': traverse_obj(media, 'title', 'episode_title'),
'alt_title': traverse_obj(media, ('track_info', 'media_name'), expected_type=strip_or_none), 'alt_title': traverse_obj(media, ('track_info', 'media_name'), expected_type=strip_or_none),
'description': media.get('description'), 'description': media.get('description'),
@ -521,26 +546,6 @@ class RaiPlaySoundIE(RaiBaseIE):
} }
class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)'
_TESTS = [{
'url': 'https://www.raiplaysound.it/radio2',
'info_dict': {
'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44',
'display_id': 'radio2',
'ext': 'mp4',
'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+',
'thumbnail': r're:^https://www\.raiplaysound\.it/dl/img/.+\.png',
'uploader': 'rai radio 2',
'series': 'Rai Radio 2',
'creator': 'raiplaysound',
'is_live': True,
'live_status': 'is_live',
},
'params': {'skip_download': True},
}]
class RaiPlaySoundPlaylistIE(InfoExtractor): class RaiPlaySoundPlaylistIE(InfoExtractor):
_VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?' _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?:programmi|playlist|audiolibri)/(?P<id>[^/?#&]+))(?:/(?P<extra_id>[^?#&]+))?'
_TESTS = [{ _TESTS = [{