Compare commits

...

8 Commits

Author SHA1 Message Date
scrat5h
587c458e71
Merge df8c550abe into eb15fd5a32 2024-11-17 21:33:20 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
scrat5h
df8c550abe Revert "[extractor/pornhub] Remove fetch of /video/get_media for formats"
This reverts commit 9ecbbcd844.
2022-11-23 12:34:42 +01:00
scrat5h
9ecbbcd844 [extractor/pornhub] Remove fetch of /video/get_media for formats
Previously URLs that contained `/video/get_media` returned JSON with
available formats.

Some time ago Pornhub seem to removed this endpoint and has started to
return `HTTP Error 403: Forbidden`, see #4298.

Nowadays it seem to serve originally requested html page which, of
course, cannot be parsed with JSON parser. yt-dlp produced WARNING:

```
Failed to parse JSON: Expecting value in '': line 1 column 1 (char 0).
```

Since we are already getting format of the video by other means
(`mpd` or `m3u8`) this change removes fetching of URLs that don't
provide value to us anymore.

Fixes: #5615
2022-11-22 23:03:04 +01:00
scrat5h
6887f87208 [extractor/pornhub] URL to subtitles is relative
`closedCaptionsFile` field on the page is not absolute URL (anymore?)
but relative one.

This change makes extracting subtitles working again.
2022-11-22 23:03:04 +01:00
scrat5h
1138e33ac5 [extractor/pornhub] Add data for tests 2022-11-22 23:03:04 +01:00
scrat5h
bdceb022d0 [extractor/pornhub] uploader in test renamed from Babes to BABES-COM 2022-11-22 23:03:04 +01:00
4 changed files with 185 additions and 6 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
_VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
_TESTS = [{
'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
'md5': '1ed67f9c3a1e74acf15db69590cf6210',
'info_dict': {
'id': '316173',
'ext': 'mp4',
'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
'tags': [],
'uploader': 'Unbox Therapy',
'upload_date': '20220517',
'view_count': int,
'duration': 722.86,
'timestamp': 1652764468,
},
}, {
'url': 'https://video.kenh14.vn/video-316174.chn',
'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
'info_dict': {
'id': '316174',
'ext': 'mp4',
'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
'tags': [],
'upload_date': '20220517',
'view_count': int,
'duration': 70.04,
'timestamp': 1652766021,
},
}, {
'url': 'https://video.kenh14.vn/0-344740.chn',
'md5': 'b843495d5e728142c8870c09b46df2a9',
'info_dict': {
'id': '344740',
'ext': 'mov',
'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
'uploader': 'Quang Vũ',
'upload_date': '20241024',
'view_count': int,
'duration': 198.88,
'timestamp': 1729741590,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
direct_url = attrs['data-vid']
metadata = self._download_json(
'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)
formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
subtitles = {}
video_data = self._download_json(
f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
fmts, subs = self._extract_m3u8_formats_and_subtitles(
hls_url, video_id, m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
fmts, subs = self._extract_mpd_formats_and_subtitles(
dash_url, video_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return {
**traverse_obj(metadata, {
'duration': ('duration', {parse_duration}),
'uploader': ('author', {strip_or_none}),
'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
'view_count': ('views', {int_or_none}),
}),
'id': video_id,
'title': (
traverse_obj(metadata, ('title', {strip_or_none}))
or clean_html(self._og_search_title(webpage))
or clean_html(get_element_by_class('vdbw-title', webpage))),
'formats': formats,
'subtitles': subtitles,
'description': (
clean_html(self._og_search_description(webpage))
or clean_html(get_element_by_class('vdbw-sapo', webpage))),
'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
{lambda x: x.split(';')}, ..., filter)),
}
class Kenh14PlaylistIE(InfoExtractor):
_VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
_TESTS = [{
'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
'info_dict': {
'id': '71',
'title': 'Trần Tình (Naked love) mùa 2',
'description': 'md5:e9522339304956dea931722dd72eddb2',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
},
'playlist_count': 9,
}, {
'url': 'https://video.kenh14.vn/playlist/0-72.chn',
'info_dict': {
'id': '72',
'title': 'Lau Lại Đầu Từ',
'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
},
'playlist_count': 6,
}]
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
category_detail = get_element_by_class('category-detail', webpage) or ''
embed_info = traverse_obj(
self._yield_json_ld(webpage, playlist_id),
(lambda _, v: v['name'] and v['alternateName'], any)) or {}
return self.playlist_from_matches(
get_elements_html_by_class('video-item', webpage), playlist_id,
(clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
ie=Kenh14VideoIE, playlist_description=(
clean_html(get_element_by_class('description', category_detail))
or unescapeHTML(embed_info.get('alternateName'))),
thumbnail=traverse_obj(
self._og_search_thumbnail(webpage),
({url_or_none}, {update_url(query=None)})))

View File

@ -11,6 +11,7 @@ from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
NO_DEFAULT, NO_DEFAULT,
ExtractorError, ExtractorError,
base_url,
clean_html, clean_html,
determine_ext, determine_ext,
format_field, format_field,
@ -23,6 +24,7 @@ from ..utils import (
update_url_query, update_url_query,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin,
) )
@ -137,12 +139,14 @@ class PornHubIE(PornHubBaseIE):
_EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)'] _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
_TESTS = [{ _TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': 'a6391306d050e4547f62b3f485dd9ba9', 'md5': '4d4a4e9178b655776f86cf89ecaf0edf',
'info_dict': { 'info_dict': {
'id': '648719015', 'id': '648719015',
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': r're:^https://.i\.phncdn\.com/videos/201306/28/14084201/original/.*\.jpg',
'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
'uploader': 'Babes', 'uploader': 'BABES-COM',
'uploader_id': '/users/babes-com',
'upload_date': '20130628', 'upload_date': '20130628',
'timestamp': 1372447216, 'timestamp': 1372447216,
'duration': 361, 'duration': 361,
@ -207,11 +211,22 @@ class PornHubIE(PornHubBaseIE):
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a', 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
'info_dict': { 'info_dict': {
'id': 'ph601dc30bae19a', 'id': 'ph601dc30bae19a',
'ext': 'mp4',
'uploader': 'Projekt Melody', 'uploader': 'Projekt Melody',
'uploader_id': 'projekt-melody', 'uploader_id': 'projekt-melody',
'upload_date': '20210205', 'upload_date': '20210205',
'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)', 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
'thumbnail': r're:https?://.+', 'thumbnail': r're:https?://.+',
'age_limit': 18,
'view_count': int,
'cast': [],
'like_count': int,
'comment_count': int,
'dislike_count': int,
'timestamp': 1612564932,
'duration': 8173,
'categories': list,
'tags': list,
}, },
}, { }, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
@ -317,10 +332,10 @@ class PornHubIE(PornHubBaseIE):
r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
video_id) video_id)
if flashvars: if flashvars:
subtitle_url = url_or_none(flashvars.get('closedCaptionsFile')) subtitle_url = flashvars.get('closedCaptionsFile')
if subtitle_url: if subtitle_url:
subtitles.setdefault('en', []).append({ subtitles.setdefault('en', []).append({
'url': subtitle_url, 'url': urljoin(base_url(url), subtitle_url),
'ext': 'srt', 'ext': 'srt',
}) })
thumbnail = flashvars.get('image_url') thumbnail = flashvars.get('image_url')