Compare commits

...

5 Commits

Author (SHA1) Date
    Commit message

Luc Ritchie (6a4c01b0b6) 2024-11-17 21:24:20 +05:30
    Merge eee4f7163d into eb15fd5a32

krichbanana (eb15fd5a32) 2024-11-17 14:12:26 +00:00
    [ie/kenh14] Add extractor (#3996)
    Closes #3937
    Authored by: krichbanana, pzhlkj6612
    Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>

sepro (7cecd299e4) 2024-11-17 13:32:12 +01:00
    [ie/chaturbate] Don't break embed detection (#11565)
    Bugfix for 720b3dc453
    Authored by: seproDev

Luc Ritchie (eee4f7163d) 2023-10-12 06:42:39 -04:00
    Handle --flat-playlist when recording multi_video in archive

Luc Ritchie (8f35b886d9) 2023-10-12 06:33:00 -04:00
    Record multi_video playlists in download archive
4 changed files with 188 additions and 6 deletions

yt_dlp/YoutubeDL.py

@@ -1470,7 +1470,8 @@ class YoutubeDL:
         return self.get_output_path(dir_type, filename)
 
     def _match_entry(self, info_dict, incomplete=False, silent=False):
-        """Returns None if the file should be downloaded"""
+        """Returns None if the file should be downloaded, False if the file is already present in
+        the download archive, or a string describing another reason to skip the file"""
 
         _type = 'video' if 'playlist-match-filter' in self.params['compat_opts'] else info_dict.get('_type', 'video')
         assert incomplete or _type == 'video', 'Only video result can be considered complete'
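Note: with this change _match_entry() returns one of three values: None (download the entry), False (skip because it is already in the download archive), or a string (skip for any other reason, such as a match filter). A minimal sketch of how a caller can branch on that tri-state value; the helper name describe_skip is illustrative and not part of the patch:

    def describe_skip(skip_reason):
        # skip_reason is the value returned by YoutubeDL._match_entry()
        if skip_reason is None:
            return 'download'
        if skip_reason is False:
            return 'already recorded in the download archive'
        return f'skipped: {skip_reason}'

The playlist and video code further down relies on exactly this distinction via the expression "skip_reason is False".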
@@ -1545,6 +1546,7 @@ class YoutubeDL:
                 format_field(info_dict, 'id', f'{self._format_screen("%s", self.Styles.ID)}: '),
                 format_field(info_dict, 'title', f'{self._format_screen("%s", self.Styles.EMPHASIS)} '),
                 'has already been recorded in the archive'))
+            ret = False
             break_opt, break_err = 'break_on_existing', ExistingVideoReached
         else:
             try:
@@ -1553,12 +1555,13 @@ class YoutubeDL:
                 reason, break_opt, break_err = e.msg, 'match_filter', type(e)
             else:
                 break_opt, break_err = 'break_on_reject', RejectedVideoReached
+            ret = reason
         if reason is not None:
             if not silent:
                 self.to_screen('[download] ' + reason)
             if self.params.get(break_opt, False):
                 raise break_err()
-        return reason
+        return ret
 
     @staticmethod
     def add_extra_info(info_dict, extra_info):
@@ -1832,6 +1835,7 @@ class YoutubeDL:
             self._raise_pending_errors(info_copy)
             if self.params.get('force_write_download_archive', False):
                 self.record_download_archive(info_copy)
+            ie_result['__write_download_archive'] = self.params.get('force_write_download_archive', False)
             return ie_result
 
         if result_type == 'video':
@@ -1971,7 +1975,9 @@ class YoutubeDL:
 
         common_info = self._playlist_infodict(ie_result, strict=True)
         title = common_info.get('playlist') or '<Untitled>'
-        if self._match_entry(common_info, incomplete=True) is not None:
+        skip_reason = self._match_entry(common_info, incomplete=True)
+        if skip_reason is not None:
+            ie_result['__write_download_archive'] = skip_reason is False
             return
         self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}')
 
@@ -2026,6 +2032,7 @@ class YoutubeDL:
             self.write_debug('The information of all playlist entries will be held in memory')
 
         failures = 0
+        all_write_download_archive = True
         max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
         for i, (playlist_index, entry) in enumerate(entries):
             if lazy:
@@ -2059,6 +2066,8 @@ class YoutubeDL:
             }, extra))
             if not entry_result:
                 failures += 1
+            elif not entry_result.get('__write_download_archive', False):
+                all_write_download_archive = False
             if failures >= max_failures:
                 self.report_error(
                     f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction')
@@ -2073,6 +2082,12 @@ class YoutubeDL:
             # Do not set for full playlist
             ie_result.pop('requested_entries')
 
+        if ie_result['_type'] == 'multi_video' and not failures:
+            if self.params.get('force_write_download_archive') or (
+                    all_write_download_archive and not self.params.get('simulate')
+                    and not self.params.get('skip_download')):
+                self.record_download_archive(ie_result)
+
         # Write the updated info to json
         if _infojson_written is True and self._write_info_json(
                 'updated playlist', ie_result,
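Note: the new block records a multi_video playlist in the download archive only when no entry failed and every successful entry itself wrote (or already had) an archive entry, unless force_write_download_archive is set. A standalone sketch of that aggregation, with plain dicts standing in for per-entry results and their __write_download_archive flags (the function name and parameters are illustrative, not the patch itself):

    def should_record_multi_video(entries, *, force=False, simulate=False, skip_download=False):
        # entries: one dict per playlist entry; a falsy entry means extraction failed
        failures = sum(1 for entry in entries if not entry)
        all_written = all(
            entry.get('__write_download_archive', False) for entry in entries if entry)
        # mirrors the condition guarding record_download_archive(ie_result) above
        return not failures and (
            force or (all_written and not simulate and not skip_download))

    print(should_record_multi_video([{'__write_download_archive': True}] * 3))  # True
    print(should_record_multi_video([{'__write_download_archive': True}, {}]))  # False: one entry failed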
@@ -2919,7 +2934,9 @@ class YoutubeDL:
         info_dict, _ = self.pre_process(info_dict)
 
-        if self._match_entry(info_dict, incomplete=self._format_fields) is not None:
+        skip_reason = self._match_entry(info_dict, incomplete=self._format_fields)
+        if skip_reason is not None:
+            info_dict['__write_download_archive'] = skip_reason is False
             return info_dict
 
         self.post_extract(info_dict)
@@ -3022,6 +3039,7 @@ class YoutubeDL:
             assert write_archive.issubset({True, False, 'ignore'})
             if True in write_archive and False not in write_archive:
                 self.record_download_archive(info_dict)
+                info_dict['__write_download_archive'] = True
 
             info_dict['requested_downloads'] = downloaded_formats
             info_dict = self.run_all_pps('after_video', info_dict)
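For a single video, the archive write is gated on the write_archive set seen in this hunk, which appears to hold one __write_download_archive value per requested download; the set-based check means the archive is recorded only when every download reported success ('ignore' values are neutral). A self-contained sketch of just that decision logic:

    def archive_decision(flags):
        # flags: one value per requested download, each True, False or 'ignore'
        write_archive = set(flags)
        assert write_archive.issubset({True, False, 'ignore'})
        return True in write_archive and False not in write_archive

    print(archive_decision([True, 'ignore']))  # True: record in the archive
    print(archive_decision([True, False]))     # False: a download did not complete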

yt_dlp/extractor/_extractors.py

@@ -946,6 +946,10 @@ from .kaltura import KalturaIE
 from .kankanews import KankaNewsIE
 from .karaoketv import KaraoketvIE
 from .kelbyone import KelbyOneIE
+from .kenh14 import (
+    Kenh14PlaylistIE,
+    Kenh14VideoIE,
+)
 from .khanacademy import (
     KhanAcademyIE,
     KhanAcademyUnitIE,
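Note: registering Kenh14PlaylistIE and Kenh14VideoIE here is what makes them discoverable. A quick check that a kenh14 URL resolves to the new extractor, assuming a yt-dlp build that includes this branch (the key Kenh14Video is the default derived from the class name, stated here as the expected result):

    import yt_dlp.extractor

    url = 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn'
    for ie in yt_dlp.extractor.gen_extractor_classes():
        # skip the catch-all generic extractor, which claims almost any URL
        if ie.ie_key() != 'Generic' and ie.suitable(url):
            print(ie.ie_key())  # expected: Kenh14Video
            break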

yt_dlp/extractor/chaturbate.py

@@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
             'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
         }
 
-    def _extract_from_webpage(self, video_id, tld):
+    def _extract_from_html(self, video_id, tld):
         webpage = self._download_webpage(
             f'https://chaturbate.{tld}/{video_id}/', video_id,
             headers=self.geo_verification_headers(), impersonate=True)

@@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id, tld = self._match_valid_url(url).group('id', 'tld')
-        return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld)
+        return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)
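Context for the rename: InfoExtractor already defines a classmethod _extract_from_webpage(url, webpage) that generic extraction calls to discover embeds, so an instance method of the same name on ChaturbateIE shadowed that hook after 720b3dc453; renaming it to _extract_from_html removes the clash. A toy illustration of the shadowing (these classes are stand-ins, not yt-dlp code):

    class Base:
        @classmethod
        def _extract_from_webpage(cls, url, webpage):
            # embed-discovery hook the framework expects to call on every extractor class
            yield from ()

    class Site(Base):
        # same name, unrelated signature: the framework's call now hits this method
        def _extract_from_webpage(self, video_id, tld):
            return {'id': video_id}

    # list(Site._extract_from_webpage('https://example.com', '<html>'))
    # -> TypeError instead of an iterator of embed results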

yt_dlp/extractor/kenh14.py (new file, +160 lines)

@@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
    clean_html,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_attribute,
    get_elements_html_by_class,
    int_or_none,
    parse_duration,
    parse_iso8601,
    remove_start,
    strip_or_none,
    unescapeHTML,
    update_url,
    url_or_none,
)
from ..utils.traversal import traverse_obj


class Kenh14VideoIE(InfoExtractor):
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
        direct_url = attrs['data-vid']

        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
                remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)

        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}
        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
        if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
            fmts, subs = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        return {
            **traverse_obj(metadata, {
                'duration': ('duration', {parse_duration}),
                'uploader': ('author', {strip_or_none}),
                'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
                'view_count': ('views', {int_or_none}),
            }),
            'id': video_id,
            'title': (
                traverse_obj(metadata, ('title', {strip_or_none}))
                or clean_html(self._og_search_title(webpage))
                or clean_html(get_element_by_class('vdbw-title', webpage))),
            'formats': formats,
            'subtitles': subtitles,
            'description': (
                clean_html(self._og_search_description(webpage))
                or clean_html(get_element_by_class('vdbw-sapo', webpage))),
            'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
            'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
                {lambda x: x.split(';')}, ..., filter)),
        }


class Kenh14PlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        return self.playlist_from_matches(
            get_elements_html_by_class('video-item', webpage), playlist_id,
            (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
            getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
            ie=Kenh14VideoIE, playlist_description=(
                clean_html(get_element_by_class('description', category_detail))
                or unescapeHTML(embed_info.get('alternateName'))),
            thumbnail=traverse_obj(
                self._og_search_thumbnail(webpage),
                ({url_or_none}, {update_url(query=None)})))
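A short usage sketch for the new extractor through the Python API, using the first test URL above; this assumes a yt-dlp install that includes this branch:

    import yt_dlp

    url = 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn'
    with yt_dlp.YoutubeDL() as ydl:
        # metadata only; pass download=True to actually fetch the formats listed above
        info = ydl.extract_info(url, download=False)
        print(info['id'], info.get('title'))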