Compare commits

...

17 Commits

Author SHA1 Message Date
Jesse Millwood
12dcdc493c
Merge bbdbc7111a into eb15fd5a32 2024-11-17 21:24:20 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
Jesse Millwood
bbdbc7111a Update yt_dlp/extractor/fosdem.py
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
2024-02-07 14:26:44 -05:00
Jesse Millwood
354c1f16ec [extractors/fosdem] Replace descriptions in tests with md5sums 2024-02-07 14:26:44 -05:00
Jesse Millwood
f0dada1643 Update yt_dlp/extractor/fosdem.py
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
2024-02-07 14:26:44 -05:00
Jesse Millwood
567bd3a83e
Update supportedsites.md
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
2024-02-07 13:40:09 -05:00
Jesse Millwood
7187056b69 [extractors/fosdem] flake8 cleanup 2023-05-07 07:59:15 -04:00
Jesse Millwood
9a3f1a2a4d [extractors/fosdem] Made style changes 2023-05-06 07:40:52 -04:00
Jesse Millwood
e4e7312e8e [extractor/fosdem] Added Fosdem to supported sites 2023-05-06 07:08:20 -04:00
Jesse Millwood
31f9edb502 [extractor/fosdem] Added playlist support 2023-05-06 07:08:14 -04:00
Jesse Millwood
e15cbfb217 [extractor/fosdem] Added release date and cast to tests 2023-05-06 07:08:07 -04:00
Jesse Millwood
03e4ca498a [extractor/fosdem] Move parsing logic 2023-05-06 07:07:55 -04:00
Jesse Millwood
a1a330cd9c [extractor/fosdem] Add test that needs the re.DOTALL regex for description 2023-05-06 07:07:45 -04:00
Jesse Millwood
771fd0f0cc [extractor/fosdem] Include year 2023-05-06 07:07:35 -04:00
Jesse Millwood
36a1f6294c [extractor/fosdem] Use re.DOTALL for html search for description 2023-05-06 07:07:22 -04:00
Jesse Millwood
33aac01e30 [extractor/fosdem] Added FOSDEM extractor 2023-05-06 07:06:58 -04:00
4 changed files with 273 additions and 2 deletions

View File

@ -663,6 +663,9 @@ from .floatplane import (
from .folketinget import FolketingetIE from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE from .footyroom import FootyRoomIE
from .formula1 import Formula1IE from .formula1 import Formula1IE
from .fosdem import (
FosdemIE
)
from .fourtube import ( from .fourtube import (
FourTubeIE, FourTubeIE,
FuxIE, FuxIE,
@ -946,6 +949,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

104
yt_dlp/extractor/fosdem.py Normal file
View File

@ -0,0 +1,104 @@
from .common import InfoExtractor
import re
class FosdemIE(InfoExtractor):
    """Extractor for FOSDEM schedule pages.

    ``_VALID_URL`` accepts two kinds of page, told apart by its ``type``
    group: an ``event`` page (a single talk, returned as one video) and a
    ``track`` page (a devroom, returned as a playlist of event URLs).
    """

    _VALID_URL = r'https?://(?:www\.)?(?:archive\.)?fosdem\.org/(?P<year>\d{4})/schedule/(?P<type>track|event)/(?P<id>[\w.-]+)'
    _TESTS = [
        {
            'url': 'https://archive.fosdem.org/2022/schedule/event/firmware_updates_for_opnsense_and_pfsense/',
            'info_dict': {
                'id': 'firmware_updates_for_opnsense_and_pfsense',
                'ext': 'webm',
                'title': 'Firmware updates for OPNsense and pfSense with fwupd/LVFS',
                'thumbnail': None,
                'release_date': '2022',
                'cast': ['Norbert Kamiński'],
                'uploader': 'FOSDEM',
                'description': 'md5:06a533c1dd130b9b9aa75a8c50c2625f',
            },
        },
        {
            'url': 'https://fosdem.org/2023/schedule/event/microkernel2023/',
            'info_dict': {
                'id': 'microkernel2023',
                'ext': 'webm',
                'title': 'The Microkernel Landscape in 2023',
                'thumbnail': None,
                'release_date': '2023',
                'uploader': 'FOSDEM',
                'cast': ['Martin Děcký'],
                'description': 'md5:dd38c1219fe9cc4aa18b2ef51f70f24c',
            },
        },
        {
            'url': 'https://fosdem.org/2023/schedule/event/hwacceluk/',
            'info_dict': {
                'id': 'hwacceluk',
                'ext': 'webm',
                'title': 'Hardware acceleration for Unikernels',
                'thumbnail': None,
                'release_date': '2023',
                'cast': ['Anastassios Nanos', 'Charalampos Mainas'],
                'uploader': 'FOSDEM',
                'description': 'md5:0e4d502d9aadd42d844407b49fab276c',
            },
        },
        {
            'url': 'https://fosdem.org/2023/schedule/track/microkernel_and_component_based_os/',
            'playlist_count': 11,
            'info_dict': {
                'id': 'microkernel_and_component_based_os',
                'title': 'Microkernel and Component-based OS devroom',
            },
        },
    ]

    def _real_extract(self, url):
        """Return a playlist for a ``track`` URL or a video info dict for an ``event`` URL.

        The page is downloaded once; everything else is scraped from its HTML
        with regular expressions.
        """
        video_id, url_type, year = self._match_valid_url(url).group('id', 'type', 'year')
        webpage = self._download_webpage(url, video_id)

        # default=None makes the page-title lookup non-fatal so that the
        # OpenGraph fallback is actually reachable when the markup differs.
        title = (
            self._html_search_regex(
                r'<div id=\"pagetitles\">\n\s+<h1>(.+?)</h1>', webpage, 'title', default=None)
            or self._og_search_title(webpage))

        if url_type == 'track':
            # NOTE(review): this pattern only accepts [a-z0-9] in the event
            # slug, while _VALID_URL (and the first test URL) allow
            # underscores - confirm whether such events are meant to be
            # skipped from track playlists.
            event_paths = re.findall(
                r'<td><a href=\"(?P<event>/[0-9]+/schedule/event/[a-z0-9]+/)', webpage)
            # A track page without matching links yields an empty playlist
            # instead of failing on a variable that was never assigned.
            return self.playlist_result(
                [self.url_result('https://fosdem.org' + path, 'Fosdem') for path in event_paths],
                playlist_id=video_id,
                playlist_title=title,
                playlist_description=None)

        # _VALID_URL only admits 'track' or 'event', so from here on the page
        # is an event page.
        evnt_blurb_rgx = r'<div class=\"event-blurb\">\n*(?P<blurb>(<div class=\"event-abstract\">(<p>(.+?)</p>\n*)+</div>)+\n*(<div class=\"event-description\">(<p>(.+?)</p>\n*)*</div>))+\n*</div>'
        # The blurb spans several lines, hence re.DOTALL; a missing blurb is
        # tolerated (fatal=False) and simply leaves the description empty.
        description = self._html_search_regex(
            evnt_blurb_rgx, webpage, 'event blurb',
            group='blurb', flags=re.DOTALL, fatal=False)

        # The recording link is mandatory: without it there is nothing to download.
        video_url = self._html_search_regex(
            r'<li><a href=\"(https://video.fosdem.org/[0-9]{4}/.+)\">', webpage, 'video url')

        # Only speakers whose displayed name is exactly two words are matched.
        cast_rgx = r'<td><a href=\"/[0-9]+/schedule/speaker/[a-z_]+/\">(?P<speaker>\w+ \w+)</a></td>'
        cast = re.findall(cast_rgx, webpage, flags=re.UNICODE)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': 'FOSDEM',
            'url': video_url,
            'thumbnail': None,
            # NOTE(review): this is just the four-digit year taken from the
            # URL, not a full date - confirm whether `release_year` would be
            # the more appropriate info-dict field.
            'release_date': year,
            'cast': cast,
            'webpage_url': url,
        }

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single video pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video page.

        The media location comes from the ``data-vid`` attribute of the page
        element whose ``type`` attribute is ``VideoStream``. A direct HTTP
        format is always offered; HLS and DASH formats are added when the
        companion ``.json`` document lists them.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        player_attrs = extract_attributes(player_html or '')
        media_path = player_attrs['data-vid']

        # Metadata lookup is best-effort (fatal=False); the API is queried by
        # the media path with its CDN host prefix removed.
        api_url = 'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
            remove_start(media_path, 'kenh14cdn.com/'))
        metadata = self._download_json(api_url, video_id, fatal=False)

        formats = [{
            'url': f'https://{media_path}',
            'format_id': 'http',
            'quality': 1,
        }]
        subtitles = {}

        # Optional sidecar document that may point at adaptive manifests.
        video_data = self._download_json(
            f'https://{media_path}.json', video_id,
            note='Downloading video data', fatal=False)

        hls_url = traverse_obj(video_data, ('hls', {url_or_none}))
        if hls_url:
            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        dash_url = traverse_obj(video_data, ('mpd', {url_or_none}))
        if dash_url:
            dash_formats, dash_subtitles = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(dash_formats)
            self._merge_subtitles(dash_subtitles, target=subtitles)

        api_fields = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })

        # Prefer the API title, then fall back to what the page itself exposes.
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or player_attrs.get('data-thumb')

        # The keywords meta tag is ';'-separated; empty pieces are filtered out.
        keywords = self._html_search_meta('keywords', webpage)
        tags = traverse_obj(keywords, ({lambda value: value.split(';')}, ..., filter))

        return {
            **api_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist from the ``video-item`` elements of a playlist page.

        Title and description are read from the ``category-detail`` element
        first, with the page's JSON-LD (``name`` / ``alternateName``) as the
        fallback for each.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        # First JSON-LD object that carries both a name and an alternateName.
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def video_page_url(item_html):
            # Each item only exposes a numeric id; rebuild a canonical video URL from it.
            item_id = extract_attributes(item_html)['data-id']
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(item_id)

        item_elements = get_elements_html_by_class('video-item', webpage)
        playlist_title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        playlist_description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # Drop the query string from the OpenGraph thumbnail URL.
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            item_elements, playlist_id, playlist_title,
            getter=video_page_url, ie=Kenh14VideoIE,
            playlist_description=playlist_description,
            thumbnail=thumbnail)