Compare commits

...

6 Commits

Author SHA1 Message Date
Simon Sawicki
8a53d27c26
Merge fe274adf41 into eb15fd5a32 2024-11-17 21:14:57 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
Simon Sawicki
fe274adf41
Fix deprecation warning?!? lol 2024-01-07 03:02:34 +01:00
Simon Sawicki
96f9bbf392
Proposal v2, with carried state 2024-01-07 02:29:54 +01:00
Simon Sawicki
8dbf2cf66d
Add lazy dict class draft 2023-11-03 20:47:57 +01:00
5 changed files with 353 additions and 2 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,
@ -2617,3 +2621,4 @@ from .zingmp3 import (
) )
from .zoom import ZoomIE from .zoom import ZoomIE
from .zype import ZypeIE from .zype import ZypeIE
from .lazy import LazyExtractorIE

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single videos on video.kenh14.vn.

    The page embeds a player element with ``type="VideoStream"`` whose
    ``data-vid`` attribute holds the host+path of the direct stream
    (scheme-less — ``https://`` is prepended below).  Metadata and optional
    HLS/DASH manifests are fetched best-effort from two JSON endpoints.
    """
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        # Locate the player element via its type="VideoStream" attribute;
        # missing element yields '' so extract_attributes returns {}
        attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
        # data-vid is required: a page without it cannot be extracted
        direct_url = attrs['data-vid']
        # Best-effort metadata lookup keyed by the file name (data-vid with the
        # kenh14cdn.com/ prefix stripped); fatal=False → None on failure
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
                remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}
        # A sidecar JSON next to the stream may advertise HLS/DASH manifests
        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
        if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
            fmts, subs = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        return {
            # API metadata first; the explicit keys below take precedence
            **traverse_obj(metadata, {
                'duration': ('duration', {parse_duration}),
                'uploader': ('author', {strip_or_none}),
                'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
                'view_count': ('views', {int_or_none}),
            }),
            'id': video_id,
            # Title: API value, then og:title, then the on-page title element
            'title': (
                traverse_obj(metadata, ('title', {strip_or_none}))
                or clean_html(self._og_search_title(webpage))
                or clean_html(get_element_by_class('vdbw-title', webpage))),
            'formats': formats,
            'subtitles': subtitles,
            'description': (
                clean_html(self._og_search_description(webpage))
                or clean_html(get_element_by_class('vdbw-sapo', webpage))),
            'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
            # keywords meta is ';'-separated; filter drops empty entries
            'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
                {lambda x: x.split(';')}, ..., filter)),
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn.

    Builds a playlist from the page's ``video-item`` elements, whose
    ``data-id`` attributes are turned into Kenh14VideoIE URLs.  Title and
    description come from the ``category-detail`` element, falling back to
    JSON-LD ``name``/``alternateName``.
    """
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        category_detail = get_element_by_class('category-detail', webpage) or ''
        # First JSON-LD object carrying both name and alternateName serves as
        # a fallback source for title/description
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}
        return self.playlist_from_matches(
            get_elements_html_by_class('video-item', webpage), playlist_id,
            (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
            # Each matched element's data-id becomes a canonical video URL
            getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
            ie=Kenh14VideoIE, playlist_description=(
                clean_html(get_element_by_class('description', category_detail))
                or unescapeHTML(embed_info.get('alternateName'))),
            # Strip tracking query params from the og:image thumbnail
            thumbnail=traverse_obj(
                self._og_search_thumbnail(webpage),
                ({url_or_none}, {update_url(query=None)})))

33
yt_dlp/extractor/lazy.py Normal file
View File

@ -0,0 +1,33 @@
from .common import InfoExtractor
from ..utils.lazy import lazy_ie, lazy_fields
@lazy_ie
class LazyExtractorIE(InfoExtractor):
    """Demo extractor showing lazily-computed info-dict fields.

    Fields declared via ``@lazy_fields`` are only computed when first read
    from the resulting info dict; the webpage itself is fetched lazily via
    ``_lazy_webpage``.
    """
    IE_NAME = 'lazy'
    _VALID_URL = r'lazy://(?P<id>.*)'

    def _lazy_webpage(self, storage):
        # Resolver for storage.webpage — invoked on first access only
        return self._download_webpage(storage.url, storage.id)

    @lazy_fields('creator')
    def _extract_other(self, storage):
        self.to_screen('Extracting something else from webpage')
        # Everything before the first ' - ' is treated as the creator
        return {'creator': storage.webpage.partition(' - ')[0]}

    @lazy_fields('title', 'description')
    def _extract_website(self, storage):
        self.to_screen('Extracting title and description from webpage')
        head, _sep, tail = storage.webpage.partition('\n')
        return {
            'title': head,
            'description': tail,
        }

    # Fake downloading the webpage for testing purposes
    def _download_webpage(self, url_or_request, video_id, *args, **kwargs):
        self.to_screen(f'[{video_id}] Downloaded webpage ({url_or_request})')
        return '<creator> - Fake Webpage title\nThis is the description.\n...'

153
yt_dlp/utils/lazy.py Normal file
View File

@ -0,0 +1,153 @@
from __future__ import annotations
import functools
from collections.abc import MutableMapping
from ..utils import try_call
from ..extractor.common import InfoExtractor
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import Callable, Any
class _LazyStorage:
def __init__(self, ie, **kwargs):
self._ie = ie
self._cache = kwargs
def __setattr__(self, name, value, /) -> None:
if name.startswith("_"):
super().__setattr__(name, value)
else:
self._cache[name] = value
def __getattr__(self, name: str):
if name in self._cache:
return self._cache[name]
resolver = getattr(self._ie, f"_lazy_{name}")
result = try_call(resolver, args=(self,))
self._cache[name] = result
return result
def __delattr__(self, name: str) -> None:
if name.startswith("_"):
super().__delattr__(name)
elif name in self._cache:
del self._cache[name]
class _LazyInfoDict(MutableMapping):
    """Info dict whose missing fields are computed on first access.

    ``data`` holds eagerly-extracted fields; ``lazy`` maps field names to
    the (unbound) extractor methods able to compute them.  Reading a lazy
    key runs its compute function, merges the returned dict into ``data``
    and removes every field that function covers from the lazy registry.
    """

    def __init__(self, data: dict, lazy: dict, ie: InfoExtractor, **kwargs):
        self._data = data
        # Copy the registry: this instance consumes it field by field, and
        # mutating the caller's mapping (shared by the @lazy_ie decorator
        # across extractions) would break every subsequent extraction.
        self._lazy = dict(lazy)
        self._ie = ie
        self._storage = _LazyStorage(self._ie, **kwargs)
        # Eagerly-extracted fields win over lazy providers
        for key in self._data.keys() & self._lazy.keys():
            del self._lazy[key]
        # Seed placeholders so lazy keys participate in iteration/len/contains
        self._data.update(dict.fromkeys(self._lazy.keys()))

    def __contains__(self, key):
        return key in self._data

    def __getitem__(self, key):
        if key in self._lazy:
            compute_func = self._lazy[key]
            updates = compute_func(self._ie, self._storage)
            self._data.update(updates)
            # Fields actually returned are no longer lazy ...
            for field in updates:
                self._lazy.pop(field, None)
            # ... nor are fields the function declared via @lazy_fields,
            # even if it did not return a value for them
            fields = getattr(compute_func, lazy_fields._field_name, None) or ()
            for field in fields:
                self._lazy.pop(field, None)
        return self._data[key]

    def __setitem__(self, key, value):
        # An explicit assignment overrides any pending lazy computation
        if key in self._lazy:
            del self._lazy[key]
        self._data[key] = value

    def __delitem__(self, key):
        if key in self._lazy:
            del self._lazy[key]
        del self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self):
        return len(self._data)

    def __repr__(self):
        # Render pending lazy keys as '...' so they are distinguishable
        # from computed values
        if self._lazy:
            lazy = ", ".join(f"{key!r}: ..." for key in self._lazy.keys())
            data = ", ".join(f"{key!r}: {value!r}" for key, value in self._data.items() if key not in self._lazy)
            data = f"{{{data}}}, lazy={{{lazy}}}"
        else:
            data = f"{self._data!r}"
        return f"{type(self).__name__}({data})"
def _default_lazy_extract(self, url):
return dict(id=self._match_id(url))
def lazy_ie(klass: type[InfoExtractor] | None = None, /):
    """Class decorator turning an extractor's result into a lazy info dict.

    Collects every underscore-prefixed method marked with ``@lazy_fields``
    and wraps ``_real_extract`` so its result becomes a ``_LazyInfoDict``
    computing those fields on first access.  Usable both as ``@lazy_ie``
    and ``@lazy_ie()``.
    """
    if not klass:
        # Called as @lazy_ie() — return the decorator itself
        return lazy_ie
    _old_extract = klass._real_extract
    # Extractors without their own _real_extract get a minimal default that
    # only resolves the id from the URL
    if _old_extract is InfoExtractor._real_extract:
        _old_extract = _default_lazy_extract
    # Map every declared lazy field to the (unbound) method computing it
    lazy_members = {}
    for name in dir(klass):
        if not name.startswith('_'):
            continue
        func = getattr(klass, name)
        fields = getattr(func, lazy_fields._field_name, None)
        if not isinstance(fields, tuple):
            continue
        for field in fields:
            lazy_members[field] = func

    @functools.wraps(klass._real_extract)
    def _real_extract(self, url):
        result = _old_extract(self, url)
        assert isinstance(result, dict), 'Lazy extractors need to return a dict'
        # Pass a copy: _LazyInfoDict consumes its lazy registry as fields are
        # computed, and the mapping captured here must stay intact for every
        # later extraction performed with this extractor class
        return _LazyInfoDict(result, dict(lazy_members), self, url=url, **result)

    klass._real_extract = _real_extract
    return klass
def lazy_fields(*fields: str) -> Callable[[Callable[[Any, _LazyStorage], dict[str, Any]]], Callable[[Any, _LazyStorage], dict[str, Any]]]:
    """Mark a method as the lazy provider of *fields* in the info dict."""
    def decorator(func):
        # Record the declared fields under the well-known attribute name so
        # @lazy_ie can discover this provider later
        setattr(func, lazy_fields._field_name, fields)
        return func
    return decorator


# Attribute name used to tag decorated provider methods
lazy_fields._field_name = '_lazy_fields'
if __name__ == '__main__':
    # Manual smoke test: extract via the fake 'lazy://' extractor and read a
    # mix of eager ('id') and lazy ('title', 'creator', 'description') fields,
    # triggering their on-demand computation
    from yt_dlp import YoutubeDL
    with YoutubeDL() as ydl:
        # process=False returns the raw (lazy) info dict without postprocessing
        result = ydl.extract_info("lazy://<URL>", process=False)
        assert result
        for name in "id", "title", "creator", "description":
            print(f"{name:<10} = {result[name]!r}")