Compare commits

...

24 Commits

Author SHA1 Message Date
HobbyistDev
201f167204
Merge dfc4769853 into eb15fd5a32 2024-11-17 21:17:34 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
HobbyistDev
dfc4769853 Fix formatting issue 2024-08-02 18:30:11 +09:00
HobbyistDev
694de75a67
Merge branch 'yt-dlp:master' into viu-indonesia-fix-6482-partial 2024-08-02 18:21:30 +09:00
bashonly
b688846068
Merge branch 'master' into viu-indonesia-fix-6482-partial 2024-06-17 10:24:04 -05:00
HobbyistDev
f92dedc92a handle _download_json error correctly
This commit will fall back to webpage extraction if the API URL is not found in a particular region
2024-04-11 07:13:34 +08:00
HobbyistDev
9a6fc7f863
Merge branch 'yt-dlp:master' into viu-indonesia-fix-6482-partial 2024-04-10 07:36:16 +08:00
HobbyistDev
1c04f8d345 merge the ID extraction and the old way extraction
This commit is untested outside `ID` region
2024-04-10 07:34:46 +08:00
HobbyistDev
7733909f74
Merge branch 'yt-dlp:master' into viu-indonesia-fix-6482-partial 2024-04-06 12:03:09 +08:00
HobbyistDev
3f7d3d20bd delete single use variable current_product_subtitle_info 2024-03-01 13:24:37 +08:00
HobbyistDev
e9e2fe84a2 use regex rather than full string match in thumbnail test 2024-03-01 13:19:20 +08:00
HobbyistDev
9dcf080549 update comment 2024-02-26 20:19:46 +08:00
HobbyistDev
b13592aafc Merge branch 'viu-indonesia-fix-6482-partial' of https://github.com/HobbyistDev/yt-dlp into viu-indonesia-fix-6482-partial 2024-02-26 20:15:20 +08:00
HobbyistDev
66d3eb246a
Merge branch 'yt-dlp:master' into viu-indonesia-fix-6482-partial 2024-02-26 20:15:03 +08:00
HobbyistDev
dbb084c001 add comment about possible to bypass geo-blocking 2024-02-26 19:36:26 +08:00
HobbyistDev
1ec2c788bb add note to skip as geo-restricted to Indonesia 2024-02-24 15:56:55 +08:00
HobbyistDev
13879e2cef remove duplicated same extraction process to thumbnails 2024-02-24 15:49:02 +08:00
HobbyistDev
c7a528c198 remove trailing spaces 2024-02-21 18:41:12 +08:00
HobbyistDev
7b9f56ade5 reformat product_detail_json 2024-02-21 18:39:21 +08:00
HobbyistDev
c588ffdede inline remove unnecessary variable stream_urls 2024-02-14 21:25:01 +08:00
HobbyistDev
827821d6ca
Merge branch 'yt-dlp:master' into viu-indonesia-fix-6482-partial 2024-02-14 21:21:46 +08:00
HobbyistDev
1ba1fb1327 [viu:ott] Fix indonesian (and probably malaysian) viu extraction 2024-02-13 20:37:26 +08:00
HobbyistDev
74dc1bf198 [ie/viu] Fix _VALID_URL in ViuOTTIE 2024-02-05 13:29:21 +09:00
4 changed files with 214 additions and 12 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single video pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video page.

        The page's ``type="VideoStream"`` element supplies the direct media
        path (``data-vid``); a metadata API and a ``.json`` sidecar next to the
        media file are both queried non-fatally for extra fields and manifests.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # An absent player element yields empty attributes, so the lookup of
        # 'data-vid' below raises KeyError in that case.
        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        attrs = extract_attributes(player_html or '')
        direct_url = attrs['data-vid']

        # The metadata API is keyed by the media path without the CDN host prefix.
        api_file_name = remove_start(direct_url, 'kenh14cdn.com/')
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(api_file_name),
            video_id, fatal=False)

        # The direct file is always offered as a format.
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}

        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)

        # Each manifest advertised by the sidecar JSON is optional; HLS is
        # processed before DASH.
        manifest_handlers = (
            ('hls', self._extract_m3u8_formats_and_subtitles, {'m3u8_id': 'hls'}),
            ('mpd', self._extract_mpd_formats_and_subtitles, {'mpd_id': 'dash'}),
        )
        for manifest_key, extract_manifest, id_kwargs in manifest_handlers:
            manifest_url = traverse_obj(video_data, (manifest_key, {url_or_none}))
            if not manifest_url:
                continue
            fmts, subs = extract_manifest(manifest_url, video_id, fatal=False, **id_kwargs)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        api_fields = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })

        # Title prefers the API value, then OpenGraph, then the on-page heading.
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')
        # The keywords meta tag is ';'-separated; empty entries are dropped.
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda keywords: keywords.split(';')}, ..., filter))

        return {
            **api_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist whose entries are handed to Kenh14VideoIE.

        Title and description come from the page's ``category-detail`` element,
        falling back to the first JSON-LD object that has both ``name`` and
        ``alternateName``.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def entry_url(item_html):
            # Every 'video-item' element carries the video id in its 'data-id' attribute.
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(
                extract_attributes(item_html)['data-id'])

        video_items = get_elements_html_by_class('video-item', webpage)
        playlist_title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        playlist_description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # The OpenGraph thumbnail URL is used with its query string removed.
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            video_items, playlist_id, playlist_title,
            getter=entry_url, ie=Kenh14VideoIE,
            playlist_description=playlist_description, thumbnail=thumbnail)

View File

@ -5,9 +5,11 @@ import urllib.parse
import uuid import uuid
from .common import InfoExtractor from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
merge_dicts,
remove_end, remove_end,
smuggle_url, smuggle_url,
strip_or_none, strip_or_none,
@ -150,7 +152,7 @@ class ViuPlaylistIE(ViuBaseIE):
class ViuOTTIE(InfoExtractor): class ViuOTTIE(InfoExtractor):
IE_NAME = 'viu:ott' IE_NAME = 'viu:ott'
_NETRC_MACHINE = 'viu' _NETRC_MACHINE = 'viu'
_VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/(?P<lang_code>[a-z]{2}-[a-z]{2})/vod/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/(?P<lang_code>[a-z]{2}(?:-[a-z]{2})?)/vod/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I', 'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I',
'info_dict': { 'info_dict': {
@ -195,6 +197,19 @@ class ViuOTTIE(InfoExtractor):
'noplaylist': False, 'noplaylist': False,
}, },
'skip': 'Geo-restricted to Hong Kong', 'skip': 'Geo-restricted to Hong Kong',
}, {
'url': 'https://www.viu.com/ott/id/id/vod/2221644/Detective-Conan',
'info_dict': {
'id': '2221644',
'ext': 'mp4',
'description': 'md5:b199bcdb07b1e01a03529f155349ddd5',
'duration': 1425,
'series': 'Detective Conan',
'title': 'Detective Conan - Episode 1150',
'episode': 'Detective Conan - Episode 1150',
'episode_number': 1150,
'thumbnail': r're:https?://prod-images\.viu\.com/clip_asset_v6/\d+/\d+/[a-f0-9]+',
},
}] }]
_AREA_ID = { _AREA_ID = {
@ -270,27 +285,43 @@ class ViuOTTIE(InfoExtractor):
url, idata = unsmuggle_url(url, {}) url, idata = unsmuggle_url(url, {})
country_code, lang_code, video_id = self._match_valid_url(url).groups() country_code, lang_code, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id, fatal=False)
json_ld = self._search_json_ld(webpage, video_id, fatal=False)
next_js_data = (self._search_nextjs_data(webpage, video_id, fatal=False) or {}).get('props')
runtime_info = traverse_obj(next_js_data, ('initialState', 'app', 'runtimeInfo'))
query = { query = {
'r': 'vod/ajax-detail', 'r': 'vod/ajax-detail',
'platform_flag_label': 'web', 'platform_flag_label': 'web',
'product_id': video_id, 'product_id': video_id,
} }
area_id = self._AREA_ID.get(country_code.upper()) area_id = self._AREA_ID.get(country_code.upper()) or runtime_info.get('areaId')
if area_id: if area_id:
query['area_id'] = area_id query['area_id'] = area_id
try:
product_data = self._download_json( product_data = self._download_json(
f'http://www.viu.com/ott/{country_code}/index.php', video_id, f'http://www.viu.com/ott/{country_code}/index.php', video_id,
'Downloading video info', query=query)['data'] 'Downloading video info', query=query)['data']
# The `fatal` in `_download_json` didn't prevent json error
# FIXME: probably the error still too broad
except ExtractorError as e:
if not isinstance(e.cause, (json.JSONDecodeError, HTTPError)):
raise
# NOTE: some geo-blocked like https://www.viu.com/ott/sg/en/vod/108599/The-Beauty-Inside actually can bypassed
# on other region (like in ID)
product_data = traverse_obj(
next_js_data, ('pageProps', 'fallback', lambda k, v: v if re.match(r'@"PRODUCT_DETAIL"[^:]+', k) else None),
get_all=False)['data']
video_data = product_data.get('current_product') video_data = product_data.get('current_product')
if not video_data: if not video_data:
self.raise_geo_restricted() self.raise_geo_restricted()
series_id = video_data.get('series_id') series_id = video_data.get('series_id') or traverse_obj(product_data, ('series', 'series_id'))
if self._yes_playlist(series_id, video_id, idata): if self._yes_playlist(series_id, video_id, idata):
series = product_data.get('series') or {} series = product_data.get('series') or traverse_obj(product_data, ('series', 'name')) or {}
product = series.get('product') product = series.get('product')
if product: if product:
entries = [] entries = []
@ -308,7 +339,9 @@ class ViuOTTIE(InfoExtractor):
duration_limit = False duration_limit = False
query = { query = {
'ccs_product_id': video_data['ccs_product_id'], 'ccs_product_id': video_data['ccs_product_id'],
'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or '3', 'language_flag_id': self._LANGUAGE_FLAG.get(lang_code.lower()) or runtime_info.get('languageFlagId') or '3',
'platform_flag_label': 'web',
'countryCode': country_code.upper(),
} }
def download_playback(): def download_playback():
@ -384,7 +417,7 @@ class ViuOTTIE(InfoExtractor):
}) })
title = strip_or_none(video_data.get('synopsis')) title = strip_or_none(video_data.get('synopsis'))
return { return merge_dicts({
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': video_data.get('description'), 'description': video_data.get('description'),
@ -395,7 +428,12 @@ class ViuOTTIE(InfoExtractor):
'thumbnail': url_or_none(video_data.get('cover_image_url')), 'thumbnail': url_or_none(video_data.get('cover_image_url')),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
} }, traverse_obj(json_ld, {
'thumbnails': 'thumbnails',
'title': 'title',
'episode': 'episode',
'episode_number': 'episode_number',
}))
class ViuOTTIndonesiaBaseIE(InfoExtractor): class ViuOTTIndonesiaBaseIE(InfoExtractor):