Compare commits

...

4 Commits

Author SHA1 Message Date
MyNey
6264fc36a4
Merge 04417e89de into eb15fd5a32 2024-11-17 21:30:26 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
MinePlayersPE
04417e89de [Roblox] Add extractor 2022-10-08 15:45:27 +07:00
4 changed files with 296 additions and 2 deletions

View File

@@ -946,6 +946,10 @@ from .kaltura import KalturaIE
 from .kankanews import KankaNewsIE
 from .karaoketv import KaraoketvIE
 from .kelbyone import KelbyOneIE
+from .kenh14 import (
+    Kenh14PlaylistIE,
+    Kenh14VideoIE,
+)
 from .khanacademy import (
     KhanAcademyIE,
     KhanAcademyUnitIE,
@@ -1724,6 +1728,7 @@ from .rinsefm import (
     RinseFMIE,
 )
 from .rmcdecouverte import RMCDecouverteIE
+from .roblox import RobloxIE
 from .rockstargames import RockstarGamesIE
 from .rokfin import (
     RokfinChannelIE,

View File

@@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
             'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
         }
-    def _extract_from_webpage(self, video_id, tld):
+    def _extract_from_html(self, video_id, tld):
         webpage = self._download_webpage(
             f'https://chaturbate.{tld}/{video_id}/', video_id,
             headers=self.geo_verification_headers(), impersonate=True)
@@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
     def _real_extract(self, url):
         video_id, tld = self._match_valid_url(url).group('id', 'tld')
-        return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld)
+        return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single video pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Collect formats, subtitles and metadata for the video page at `url`."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The element with type="VideoStream" carries the scheme-less media
        # location in its data-vid attribute; a missing attribute is a KeyError
        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        attrs = extract_attributes(player_html or '')
        direct_url = attrs['data-vid']

        # The metadata API is queried with the media path minus the CDN host prefix
        file_name = remove_start(direct_url, 'kenh14cdn.com/')
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(file_name),
            video_id, fatal=False)

        # The direct file is always offered; manifests are added when available
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}

        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)

        hls_url = traverse_obj(video_data, ('hls', {url_or_none}))
        if hls_url:
            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        dash_url = traverse_obj(video_data, ('mpd', {url_or_none}))
        if dash_url:
            dash_formats, dash_subtitles = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(dash_formats)
            self._merge_subtitles(dash_subtitles, target=subtitles)

        api_fields = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })

        # Prefer the API title, then fall back to what the page itself exposes
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')
        # The keywords meta tag is split on ';' and passed through `filter`
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda x: x.split(';')}, ..., filter))

        return {
            **api_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist result from the `video-item` elements of the page at `url`."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        # First JSON-LD object with both a name and an alternateName, used as
        # a fallback source for the title and description
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def video_url_from_item(item_html):
            # Each item exposes its video ID through the data-id attribute
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(item_html)['data-id'])

        video_items = get_elements_html_by_class('video-item', webpage)
        playlist_title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        playlist_description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # The query string is removed from the og:image URL
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            video_items, playlist_id, playlist_title,
            getter=video_url_from_item, ie=Kenh14VideoIE,
            playlist_description=playlist_description, thumbnail=thumbnail)

129
yt_dlp/extractor/roblox.py Normal file
View File

@ -0,0 +1,129 @@
from .common import InfoExtractor
from ..postprocessor.ffmpeg import FFmpegPostProcessor
from ..utils import (
ExtractorError,
extract_attributes,
float_or_none,
get_element_by_id,
get_element_by_class,
get_element_html_by_class,
get_element_html_by_id,
int_or_none,
PostProcessingError,
strip_or_none,
str_or_none,
str_to_int,
traverse_obj,
try_call,
unified_timestamp,
)
class RobloxIE(InfoExtractor):
    """Extractor for audio/video assets under roblox.com/library/<id>."""

    _VALID_URL = r'https?://(?:www\.)?roblox\.com/library/(?P<id>\d+)'
    _TESTS = [{
        # UGC Audio
        'url': 'https://www.roblox.com/library/7910582982/Backrooms-Ambiance-High-Quality',
        # TODO: fill in the real checksum; an empty md5 cannot match any download
        'md5': '',
        'info_dict': {
            'id': '7910582982',
            'ext': 'ogg',
            'title': 'Backrooms Ambiance (High Quality)',
            'description': 'Found an actual higher quality of the sound.',
            'uploader': 'ChaseDJ549',
            'uploader_id': '412014916',
            'categories': ['Horror'],
            'like_count': int,
            'timestamp': 1636142127,
            'modified_timestamp': 1656694893,
        },
    }]

    def _real_extract(self, url):
        """Extract metadata and a single ffprobe-described format for a library asset.

        Raises ExtractorError when the page marks the asset as something other
        than Audio/Video, or when ffprobe cannot inspect the asset file.
        """
        asset_id = self._match_id(url)
        webpage = self._download_webpage(url, asset_id)

        # Page lookups fall back to '' so a missing element yields empty
        # attributes instead of a TypeError on None
        item_container_div = get_element_html_by_id('item-container', webpage) or ''
        item_container_attrs = extract_attributes(item_container_div[:item_container_div.find('>') + 1])
        asset_type = item_container_attrs.get('data-asset-type')
        if asset_type and asset_type not in ('Audio', 'Video'):
            raise ExtractorError('This asset is not an audio/video', expected=True)

        # A non-fatal _search_regex returns None on a miss, which cannot be
        # unpacked into two names; substitute a pair of Nones instead
        asset_uploader_id, asset_uploader_name = self._search_regex(
            r'>By <a.+href=["\']https?://(?:www\.)?roblox\.com/users/(?P<id>\d+)[^"\']*["\'][^>]*>@?(?P<name>\w+)</a',
            webpage, 'asset creator', fatal=False, group=('id', 'name')) or (None, None)

        # Without a .ROBLOSECURITY cookie the failure note for the toolbox
        # request is suppressed (presumably the endpoint needs auth - verify)
        is_logged_out = not self._get_cookies('https://roblox.com').get('.ROBLOSECURITY')
        toolbox_result = traverse_obj(
            self._download_json(
                'https://apis.roblox.com/toolbox-service/v1/items/details', asset_id,
                query={'assetIds': asset_id}, note='Downloading extra metadata JSON',
                errnote=False if is_logged_out else 'Unable to download extra metadata JSON',
                fatal=False),
            ('data', ...), default={}, expected_type=dict, get_all=False)
        toolbox_asset_data = toolbox_result.get('asset') or {}
        toolbox_creator_data = toolbox_result.get('creator') or {}
        toolbox_audio_data = toolbox_asset_data.get('audioDetails') or {}

        webpage_genre = strip_or_none(get_element_by_class('item-genre', webpage))
        favorite_count_attrs = extract_attributes(
            get_element_html_by_class('favoriteCount', webpage) or '')

        info_dict = {
            'id': asset_id,
            'title': toolbox_asset_data.get('name') or item_container_attrs.get('data-item-name'),
            'uploader': toolbox_creator_data.get('name') or asset_uploader_name,
            'uploader_id': str_or_none(toolbox_creator_data.get('id')) or asset_uploader_id,
            # TODO: Sound effects have separate kinds of categories
            # Avoid emitting [None] when the page has no genre element
            'categories': toolbox_asset_data.get('assetGenres') or ([webpage_genre] if webpage_genre else None),
            'like_count': str_to_int(favorite_count_attrs.get('title')),
            'timestamp': unified_timestamp(toolbox_asset_data.get('createdUtc')),
            'modified_timestamp': unified_timestamp(toolbox_asset_data.get('updatedUtc')),  # TODO: Extract from webpage
            'track': toolbox_audio_data.get('title'),
            'artist': toolbox_audio_data.get('artist'),
            'genre': try_call(lambda: toolbox_audio_data['musicGenre'].capitalize()),
        }

        cdn_result = self._download_json(
            f'https://assetdelivery.roblox.com/v1/assetId/{asset_id}',
            asset_id, note='Downloading file data JSON', headers={
                'Accept': 'application/json',
                'roblox-browser-asset-request': 'true',
            })
        asset_file_url = cdn_result.get('location')

        if not asset_file_url:
            if asset_type == 'Audio':
                # Fall back to the preview URL embedded in the page's media player icon
                media_play_icon_div = get_element_html_by_class('MediaPlayerIcon', webpage)
                if media_play_icon_div:
                    # The capture group is what _search_regex returns
                    asset_file_url = self._search_regex(
                        r'data-mediathumb-url=["\'](https?://[^"\']+)["\']',
                        media_play_icon_div, 'audio preview URL')
                else:
                    # raise_no_formats may return instead of raising, so this
                    # branch must not continue on to the regex search
                    self.raise_no_formats('This audio is unavailable', expected=True, video_id=asset_id)
            elif is_logged_out:  # assetdelivery API randomly requires auth cookies
                self.raise_login_required(metadata_available=True)
            else:
                self.raise_no_formats(
                    traverse_obj(cdn_result, ('errors', ..., 'message'), default='Unable to fetch asset', expected_type=str, get_all=False),
                    video_id=asset_id)

        if asset_file_url:
            # Assets have no file extension and use binary/octet-stream as Content-Type
            pp = FFmpegPostProcessor(self._downloader)
            self.to_screen(f'{asset_id}: Checking file format with ffprobe')
            try:
                metadata = pp.get_metadata_object(asset_file_url)
            except PostProcessingError as err:
                raise ExtractorError(err.msg, expected=True) from err

            format_data = metadata.get('format') or {}
            v_stream, a_stream = {}, {}
            # When several streams share a type, the last one wins
            for stream in metadata.get('streams') or []:
                codec_type = stream.get('codec_type')
                if codec_type == 'video':
                    v_stream = stream
                elif codec_type == 'audio':
                    a_stream = stream

            # format_name may be a comma-separated list of names; use 'mp4'
            # if it appears anywhere, otherwise the last listed name
            format_name = format_data.get('format_name') or ''
            info_dict['formats'] = [{
                'url': asset_file_url,
                'ext': 'mp4' if 'mp4' in format_name else (format_name.split(',')[-1] or None),
                'vcodec': v_stream.get('codec_name'),
                'acodec': a_stream.get('codec_name'),
                'tbr': int_or_none(format_data.get('bit_rate'), scale=1000),
                'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000),
                'abr': int_or_none(a_stream.get('bit_rate'), scale=1000),
                'height': int_or_none(v_stream.get('height')),
                'width': int_or_none(v_stream.get('width')),
                # filesize is a byte count, so keep it integral
                'filesize': int_or_none(format_data.get('size')),
            }]

        return info_dict