Compare commits

...

14 Commits

Author SHA1 Message Date
N/Ame
60683f44eb
Merge 8631ff86d9 into eb15fd5a32 2024-11-17 17:03:36 +01:00
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
grqx_wsl
8631ff86d9 fix: _parse_json needs video_id 2024-10-23 22:50:34 +13:00
grqx_wsl
eb03632cc7 do not extract storyboard in extract_formats 2024-10-23 21:10:05 +13:00
grqx_wsl
1cf870df6a Merge remote-tracking branch 'upstream/master' into ie/bilibili/storyboards 2024-10-23 21:08:24 +13:00
grqx_wsl
6f5a908dff fix heatmap extraction 2024-10-23 20:45:03 +13:00
grqx_wsl
43c6c434ae make storyboard extraction non-fatal 2024-10-23 20:40:14 +13:00
grqx_wsl
a032d2b0d5 add heatmap, update tests 2024-10-23 20:26:32 +13:00
grqx_wsl
65a28bd514 add test for storyboard 2024-10-11 06:19:04 +13:00
grqx_wsl
7679b5241e update tests 2024-10-11 05:57:41 +13:00
grqx_wsl
22e5e37c82 extract episode_info in advance 2024-10-11 01:30:07 +13:00
grqx_wsl
47254db76d misc 2024-10-10 21:18:41 +13:00
grqx_wsl
995fc27931 support storyboards 2024-10-10 20:01:20 +13:00
4 changed files with 345 additions and 37 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -33,6 +33,7 @@ from ..utils import (
parse_qs, parse_qs,
parse_resolution, parse_resolution,
qualities, qualities,
sanitize_url,
smuggle_url, smuggle_url,
srt_subtitles_timecode, srt_subtitles_timecode,
str_or_none, str_or_none,
@ -41,6 +42,7 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
value,
variadic, variadic,
) )
@ -66,6 +68,84 @@ class BilibiliBaseIE(InfoExtractor):
f'Format(s) {missing_formats} are missing; you have to login or ' f'Format(s) {missing_formats} are missing; you have to login or '
f'become a premium member to download them. {self._login_hint()}') f'become a premium member to download them. {self._login_hint()}')
def _extract_heatmap(self, cid):
    """Yield heatmap segments for the video part identified by `cid`.

    Downloads the heatmap data for `cid` and yields one dict per sample with
    `start_time`, `end_time` and `value`, where `value` is the sample divided
    by the largest sample. The heatmap is an optional extra, so every failure
    mode (failed download, missing or malformed fields, all-zero data) results
    in nothing being yielded rather than an exception.
    """
    heatmap_json = self._download_json(
        'https://bvc.bilivideo.com/pbp/data', cid,
        note='Downloading heatmap', errnote='Failed to download heatmap', fatal=False,
        query={'cid': cid})
    if not isinstance(heatmap_json, dict):
        return

    step_sec = traverse_obj(heatmap_json, ('step_sec', {int}))
    heatmap_data = traverse_obj(heatmap_json, ('events', 'default', {list}))
    if not step_sec or not heatmap_data:
        return

    # Treat non-numeric samples as 0 so one bad entry cannot break max() or the division
    samples = [float_or_none(entry, default=0) for entry in heatmap_data]
    peak = max(samples)
    if not peak:
        return

    # `debug` holds a JSON document as a string. It is only used to clamp the last
    # segment, so a missing or malformed value must not be fatal
    debug_json = traverse_obj(heatmap_json, ('debug', {str}))
    debug_info = self._parse_json(debug_json, cid, fatal=False) if debug_json else None
    duration = traverse_obj(debug_info, ('max_time', {float_or_none}))

    for idx, sample in enumerate(samples):
        start_time = idx * step_sec
        end_time = start_time + step_sec
        # Stop at the first segment that reaches the reported end of the video
        reached_end = bool(duration) and end_time >= duration
        yield {
            'start_time': start_time,
            'end_time': duration if reached_end else end_time,
            'value': sample / peak,
        }
        if reached_end:
            break
def _extract_storyboard(self, duration, aid=None, bvid=None, cid=None):
    """Build an mhtml storyboard format for a video, or return None.

    `duration` is the video duration; `aid` or `bvid` identifies the video and
    `cid` the part. Returns a format dict (`format_id` 'sb', `protocol` 'mhtml')
    whose `fragments` are the storyboard sheet images, or None when no video
    identifier/duration is given, the request fails, or no usable sheet exists.
    """
    video_id = aid or bvid
    if not video_id or not duration:
        return None

    storyboard_info = traverse_obj(self._download_json(
        'https://api.bilibili.com/x/player/videoshot', video_id,
        note='Downloading storyboard info', errnote='Failed to download storyboard info',
        # Storyboards are an optional extra: a failed request must not abort extraction
        fatal=False, query=filter_dict({
            'index': 1,
            'aid': aid,
            'bvid': bvid,
            'cid': cid,
        })), ('data', {lambda v: v if v.get('image') and v.get('index') else None}))
    if not storyboard_info:
        return None

    # NOTE(review): `img_x_len` presumably counts tiles along the x axis (i.e. columns),
    # which would make these two names swapped for non-square sheets - confirm against the API
    rows, cols = storyboard_info.get('img_x_len'), storyboard_info.get('img_y_len')
    fragments = []
    last_duration = 0.0
    for i, url in enumerate(storyboard_info['image'], start=1):
        if not rows or not cols:
            # Without the grid size the sheet cannot be mapped onto the `index` list
            fragments.append({'url': sanitize_url(url)})
            continue
        # Position in `index` of the last tile on the i-th sheet
        duration_index = i * rows * cols - 1
        if duration_index < len(storyboard_info['index']) - 1:
            current_duration = traverse_obj(storyboard_info, ('index', duration_index))
        else:
            # The final sheet is only partially filled and runs to the end of the video
            current_duration = duration
        if not current_duration or current_duration <= last_duration or current_duration > duration:
            break
        fragments.append({
            'url': sanitize_url(url),
            'duration': current_duration - last_duration,
        })
        # Advance the cursor so each fragment covers only its own time span
        last_duration = current_duration

    if not fragments:
        return None
    return {
        'format_id': 'sb',
        'format_note': 'storyboard',
        'ext': 'mhtml',
        'protocol': 'mhtml',
        'acodec': 'none',
        'vcodec': 'none',
        'url': 'about:invalid',
        'width': storyboard_info.get('img_x_size'),
        'height': storyboard_info.get('img_y_size'),
        'fps': len(storyboard_info['image']) * rows * cols / duration if rows and cols else None,
        'rows': rows,
        'columns': cols,
        'fragments': fragments,
    }
def extract_formats(self, play_info): def extract_formats(self, play_info):
format_names = { format_names = {
r['quality']: traverse_obj(r, 'new_description', 'display_desc') r['quality']: traverse_obj(r, 'new_description', 'display_desc')
@ -287,14 +367,21 @@ class BilibiliBaseIE(InfoExtractor):
cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1) cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1)
for cid, edges in cid_edges.items(): for cid, edges in cid_edges.items():
play_info = self._download_playinfo(video_id, cid, headers=headers) play_info = self._download_playinfo(video_id, cid, headers=headers)
formats = self.extract_formats(play_info)
duration = float_or_none(play_info.get('timelength'), scale=1000)
if storyboard_format := self._extract_storyboard(
duration=duration,
bvid=video_id, cid=cid):
formats.append(storyboard_format)
yield { yield {
**metainfo, **metainfo,
'id': f'{video_id}_{cid}', 'id': f'{video_id}_{cid}',
'title': f'{metainfo.get("title")} - {next(iter(edges.values())).get("title")}', 'title': f'{metainfo.get("title")} - {next(iter(edges.values())).get("title")}',
'formats': self.extract_formats(play_info), 'formats': formats,
'description': f'{json.dumps(edges, ensure_ascii=False)}\n{metainfo.get("description", "")}', 'description': f'{json.dumps(edges, ensure_ascii=False)}\n{metainfo.get("description", "")}',
'duration': float_or_none(play_info.get('timelength'), scale=1000), 'duration': duration,
'subtitles': self.extract_subtitles(video_id, cid), 'subtitles': self.extract_subtitles(video_id, cid),
'heatmap': list(self._extract_heatmap(cid)),
} }
@ -310,7 +397,7 @@ class BiliBiliIE(BilibiliBaseIE):
'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
'uploader_id': '65880958', 'uploader_id': '65880958',
'uploader': '阿滴英文', 'uploader': '阿滴英文',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'duration': 554.117, 'duration': 554.117,
'tags': list, 'tags': list,
'comment_count': int, 'comment_count': int,
@ -319,6 +406,7 @@ class BiliBiliIE(BilibiliBaseIE):
'like_count': int, 'like_count': int,
'view_count': int, 'view_count': int,
'_old_archive_ids': ['bilibili 8903802_part1'], '_old_archive_ids': ['bilibili 8903802_part1'],
'heatmap': [],
}, },
}, { }, {
'note': 'old av URL version', 'note': 'old av URL version',
@ -337,8 +425,9 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'tags': list, 'tags': list,
'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 1074402_part1'], '_old_archive_ids': ['bilibili 1074402_part1'],
'heatmap': [],
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
@ -356,7 +445,7 @@ class BiliBiliIE(BilibiliBaseIE):
'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
'tags': 'count:10', 'tags': 'count:10',
'timestamp': 1589601697, 'timestamp': 1589601697,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'uploader': '打牌还是打桩', 'uploader': '打牌还是打桩',
'uploader_id': '150259984', 'uploader_id': '150259984',
'like_count': int, 'like_count': int,
@ -366,6 +455,7 @@ class BiliBiliIE(BilibiliBaseIE):
'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
'duration': 90.314, 'duration': 90.314,
'_old_archive_ids': ['bilibili 498159642_part1'], '_old_archive_ids': ['bilibili 498159642_part1'],
'heatmap': list,
}, },
}], }],
}, { }, {
@ -377,7 +467,7 @@ class BiliBiliIE(BilibiliBaseIE):
'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
'tags': 'count:10', 'tags': 'count:10',
'timestamp': 1589601697, 'timestamp': 1589601697,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'uploader': '打牌还是打桩', 'uploader': '打牌还是打桩',
'uploader_id': '150259984', 'uploader_id': '150259984',
'like_count': int, 'like_count': int,
@ -387,6 +477,7 @@ class BiliBiliIE(BilibiliBaseIE):
'description': 'md5:e3c401cf7bc363118d1783dd74068a68', 'description': 'md5:e3c401cf7bc363118d1783dd74068a68',
'duration': 90.314, 'duration': 90.314,
'_old_archive_ids': ['bilibili 498159642_part1'], '_old_archive_ids': ['bilibili 498159642_part1'],
'heatmap': list,
}, },
}, { }, {
'url': 'https://www.bilibili.com/video/av8903802/', 'url': 'https://www.bilibili.com/video/av8903802/',
@ -399,13 +490,14 @@ class BiliBiliIE(BilibiliBaseIE):
'timestamp': 1488353834, 'timestamp': 1488353834,
'uploader_id': '65880958', 'uploader_id': '65880958',
'uploader': '阿滴英文', 'uploader': '阿滴英文',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'duration': 554.117, 'duration': 554.117,
'tags': list, 'tags': list,
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'_old_archive_ids': ['bilibili 8903802_part1'], '_old_archive_ids': ['bilibili 8903802_part1'],
'heatmap': [],
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -428,8 +520,9 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 463665680_part1'], '_old_archive_ids': ['bilibili 463665680_part1'],
'heatmap': list,
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
@ -447,8 +540,9 @@ class BiliBiliIE(BilibiliBaseIE):
'uploader_id': '528182630', 'uploader_id': '528182630',
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 893839363_part1'], '_old_archive_ids': ['bilibili 893839363_part1'],
'heatmap': [],
}, },
}, { }, {
'note': 'newer festival video', 'note': 'newer festival video',
@ -465,8 +559,9 @@ class BiliBiliIE(BilibiliBaseIE):
'uploader_id': '8469526', 'uploader_id': '8469526',
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 778246196_part1'], '_old_archive_ids': ['bilibili 778246196_part1'],
'heatmap': list,
}, },
}, { }, {
'note': 'legacy flv/mp4 video', 'note': 'legacy flv/mp4 video',
@ -484,8 +579,9 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int, 'comment_count': int,
'like_count': int, 'like_count': int,
'tags': list, 'tags': list,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 4120229_part4'], '_old_archive_ids': ['bilibili 4120229_part4'],
'heatmap': [],
}, },
'params': {'extractor_args': {'bilibili': {'prefer_multi_flv': ['32']}}}, 'params': {'extractor_args': {'bilibili': {'prefer_multi_flv': ['32']}}},
'playlist_count': 19, 'playlist_count': 19,
@ -514,8 +610,9 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'tags': list, 'tags': list,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 15700301_part1'], '_old_archive_ids': ['bilibili 15700301_part1'],
'heatmap': [],
}, },
}, { }, {
'note': 'interactive/split-path video', 'note': 'interactive/split-path video',
@ -533,7 +630,7 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 292734508_part1'], '_old_archive_ids': ['bilibili 292734508_part1'],
}, },
'playlist_count': 33, 'playlist_count': 33,
@ -552,10 +649,33 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 292734508_part1'], '_old_archive_ids': ['bilibili 292734508_part1'],
'heatmap': [],
}, },
}], }],
}, {
'note': 'storyboard',
'url': 'https://www.bilibili.com/video/av170001/',
'info_dict': {
'id': 'BV17x411w7KC_p1',
'title': '【MV】保加利亚妖王AZIS视频合辑 p01 Хоп',
'ext': 'mhtml',
'upload_date': '20111109',
'uploader_id': '122541',
'view_count': int,
'_old_archive_ids': ['bilibili 170001_part1'],
'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'uploader': '冰封.虾子',
'timestamp': 1320850533,
'comment_count': int,
'tags': ['Hop', '保加利亚妖王', '保加利亚', 'Азис', 'azis', 'mv'],
'description': 'md5:acfd7360b96547f031f7ebead9e66d9e',
'like_count': int,
'duration': 199.4,
'heatmap': list,
},
'params': {'format': 'sb', 'playlist_items': '1'},
}, { }, {
'note': '301 redirect to bangumi link', 'note': '301 redirect to bangumi link',
'url': 'https://www.bilibili.com/video/BV1TE411f7f1', 'url': 'https://www.bilibili.com/video/BV1TE411f7f1',
@ -574,7 +694,8 @@ class BiliBiliIE(BilibiliBaseIE):
'duration': 1183.957, 'duration': 1183.957,
'timestamp': 1571648124, 'timestamp': 1571648124,
'upload_date': '20191021', 'upload_date': '20191021',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'heatmap': [],
}, },
}, { }, {
'note': 'video has subtitles, which requires login', 'note': 'video has subtitles, which requires login',
@ -593,7 +714,7 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'subtitles': 'count:2', # login required for CC subtitle 'subtitles': 'count:2', # login required for CC subtitle
'_old_archive_ids': ['bilibili 898179753_part1'], '_old_archive_ids': ['bilibili 898179753_part1'],
}, },
@ -729,6 +850,9 @@ class BiliBiliIE(BilibiliBaseIE):
__post_extractor=self.extract_comments(aid)) __post_extractor=self.extract_comments(aid))
else: else:
formats = self.extract_formats(play_info) formats = self.extract_formats(play_info)
formats.append(self._extract_storyboard(
duration=float_or_none(play_info.get('timelength'), scale=1000),
bvid=video_id, cid=cid))
if not traverse_obj(play_info, ('dash')): if not traverse_obj(play_info, ('dash')):
# we only have legacy formats and need additional work # we only have legacy formats and need additional work
@ -773,6 +897,7 @@ class BiliBiliIE(BilibiliBaseIE):
'__post_extractor': self.extract_comments(aid) if idx == 0 else None, '__post_extractor': self.extract_comments(aid) if idx == 0 else None,
} for idx, fragment in enumerate(formats[0]['fragments'])], } for idx, fragment in enumerate(formats[0]['fragments'])],
'duration': float_or_none(play_info.get('timelength'), scale=1000), 'duration': float_or_none(play_info.get('timelength'), scale=1000),
'heatmap': list(self._extract_heatmap(cid)),
} }
else: else:
return { return {
@ -782,6 +907,7 @@ class BiliBiliIE(BilibiliBaseIE):
'chapters': self._get_chapters(aid, cid), 'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, cid), 'subtitles': self.extract_subtitles(video_id, cid),
'__post_extractor': self.extract_comments(aid), '__post_extractor': self.extract_comments(aid),
'heatmap': list(self._extract_heatmap(cid)),
} }
@ -805,7 +931,8 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'duration': 1420.791, 'duration': 1420.791,
'timestamp': 1320412200, 'timestamp': 1320412200,
'upload_date': '20111104', 'upload_date': '20111104',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'heatmap': list,
}, },
}, { }, {
'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'url': 'https://www.bilibili.com/bangumi/play/ep267851',
@ -824,7 +951,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'duration': 1425.256, 'duration': 1425.256,
'timestamp': 1554566400, 'timestamp': 1554566400,
'upload_date': '20190406', 'upload_date': '20190406',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
}, },
'skip': 'Geo-restricted', 'skip': 'Geo-restricted',
}, { }, {
@ -845,7 +972,8 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'duration': 1922.129, 'duration': 1922.129,
'timestamp': 1602853860, 'timestamp': 1602853860,
'upload_date': '20201016', 'upload_date': '20201016',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'heatmap': list,
}, },
}] }]
@ -865,6 +993,8 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
headers=headers) headers=headers)
premium_only = play_info.get('code') == -10403 premium_only = play_info.get('code') == -10403
episode_info = traverse_obj(play_info, ('result', 'play_view_business_info', 'episode_info'))
aid, cid = episode_info.get('aid'), episode_info.get('cid')
play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {}
formats = self.extract_formats(play_info) formats = self.extract_formats(play_info)
@ -878,7 +1008,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
episode_number, episode_info = next(( episode_number, episode_info = next((
(idx, ep) for idx, ep in enumerate(traverse_obj( (idx, ep) for idx, ep in enumerate(traverse_obj(
bangumi_info, (('episodes', ('section', ..., 'episodes')), ..., {dict})), 1) bangumi_info, (('episodes', ('section', ..., 'episodes')), ..., {dict})), 1)
if str_or_none(ep.get('id')) == episode_id), (1, {})) if str_or_none(ep.get('id')) == episode_id), (1, episode_info))
season_id = bangumi_info.get('season_id') season_id = bangumi_info.get('season_id')
season_number, season_title = season_id and next(( season_number, season_title = season_id and next((
@ -887,8 +1017,10 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
if e.get('season_id') == season_id if e.get('season_id') == season_id
), (None, None)) ), (None, None))
aid = episode_info.get('aid') aid, cid = episode_info.get('aid', aid), episode_info.get('cid', cid)
duration = float_or_none(play_info.get('timelength'), scale=1000)
if storyboard_format := self._extract_storyboard(duration=duration, aid=aid, cid=cid):
formats.append(storyboard_format)
return { return {
'id': episode_id, 'id': episode_id,
'formats': formats, 'formats': formats,
@ -907,10 +1039,11 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'season': str_or_none(season_title), 'season': str_or_none(season_title),
'season_id': str_or_none(season_id), 'season_id': str_or_none(season_id),
'season_number': season_number, 'season_number': season_number,
'duration': float_or_none(play_info.get('timelength'), scale=1000), 'duration': duration,
'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid), 'subtitles': self.extract_subtitles(episode_id, cid, aid=aid),
'__post_extractor': self.extract_comments(aid), '__post_extractor': self.extract_comments(aid),
'http_headers': {'Referer': url}, 'http_headers': {'Referer': url},
'heatmap': list(self._extract_heatmap(cid)),
} }
@ -948,7 +1081,8 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
'duration': 1525.777, 'duration': 1525.777,
'timestamp': 1425074413, 'timestamp': 1425074413,
'upload_date': '20150227', 'upload_date': '20150227',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'heatmap': list,
}, },
}], }],
}] }]
@ -1003,7 +1137,8 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
'duration': 1436.992, 'duration': 1436.992,
'timestamp': 1343185080, 'timestamp': 1343185080,
'upload_date': '20120725', 'upload_date': '20120725',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'heatmap': list,
}, },
}], }],
}] }]
@ -1037,10 +1172,14 @@ class BilibiliCheeseBaseIE(BilibiliBaseIE):
query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1}, query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1},
headers=self._HEADERS, note='Downloading playinfo')['data'] headers=self._HEADERS, note='Downloading playinfo')['data']
formats = self.extract_formats(play_info)
duration = traverse_obj(episode_info, ('duration', {int_or_none}))
if storyboard_format := self._extract_storyboard(duration=duration, aid=aid, cid=cid):
formats.append(storyboard_format)
return { return {
'id': str_or_none(ep_id), 'id': str_or_none(ep_id),
'episode_id': str_or_none(ep_id), 'episode_id': str_or_none(ep_id),
'formats': self.extract_formats(play_info), 'formats': formats,
'extractor_key': BilibiliCheeseIE.ie_key(), 'extractor_key': BilibiliCheeseIE.ie_key(),
'extractor': BilibiliCheeseIE.IE_NAME, 'extractor': BilibiliCheeseIE.IE_NAME,
'webpage_url': f'https://www.bilibili.com/cheese/play/ep{ep_id}', 'webpage_url': f'https://www.bilibili.com/cheese/play/ep{ep_id}',
@ -1048,7 +1187,7 @@ class BilibiliCheeseBaseIE(BilibiliBaseIE):
'episode': ('title', {str}), 'episode': ('title', {str}),
'title': {lambda v: v and join_nonempty('index', 'title', delim=' - ', from_dict=v)}, 'title': {lambda v: v and join_nonempty('index', 'title', delim=' - ', from_dict=v)},
'alt_title': ('subtitle', {str}), 'alt_title': ('subtitle', {str}),
'duration': ('duration', {int_or_none}), 'duration': {value(duration)},
'episode_number': ('index', {int_or_none}), 'episode_number': ('index', {int_or_none}),
'thumbnail': ('cover', {url_or_none}), 'thumbnail': ('cover', {url_or_none}),
'timestamp': ('release_date', {int_or_none}), 'timestamp': ('release_date', {int_or_none}),
@ -1061,6 +1200,7 @@ class BilibiliCheeseBaseIE(BilibiliBaseIE):
'subtitles': self.extract_subtitles(ep_id, cid, aid=aid), 'subtitles': self.extract_subtitles(ep_id, cid, aid=aid),
'__post_extractor': self.extract_comments(aid), '__post_extractor': self.extract_comments(aid),
'http_headers': self._HEADERS, 'http_headers': self._HEADERS,
'heatmap': list(self._extract_heatmap(cid)),
} }
def _download_season_info(self, query_key, video_id): def _download_season_info(self, query_key, video_id):
@ -1086,8 +1226,9 @@ class BilibiliCheeseIE(BilibiliCheeseBaseIE):
'duration': 221, 'duration': 221,
'timestamp': 1695549606, 'timestamp': 1695549606,
'upload_date': '20230924', 'upload_date': '20230924',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'view_count': int, 'view_count': int,
'heatmap': list,
}, },
}] }]
@ -1119,8 +1260,9 @@ class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE):
'duration': 221, 'duration': 221,
'timestamp': 1695549606, 'timestamp': 1695549606,
'upload_date': '20230924', 'upload_date': '20230924',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'view_count': int, 'view_count': int,
'heatmap': list,
}, },
}], }],
'params': {'playlist_items': '1'}, 'params': {'playlist_items': '1'},
@ -1492,6 +1634,7 @@ class BilibiliPlaylistIE(BilibiliSpaceListBaseIE):
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'_old_archive_ids': ['bilibili 687146339_part1'], '_old_archive_ids': ['bilibili 687146339_part1'],
'heatmap': [],
}, },
'params': {'noplaylist': True}, 'params': {'noplaylist': True},
}, { }, {
@ -1686,8 +1829,9 @@ class BiliBiliSearchIE(SearchInfoExtractor):
'comment_count': int, 'comment_count': int,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:https?://.*\.(?:jpg|jpeg|png)$',
'_old_archive_ids': ['bilibili 988222410_part1'], '_old_archive_ids': ['bilibili 988222410_part1'],
'heatmap': [],
}, },
}], }],
}] }]
@ -1734,7 +1878,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE):
'id': '1003142', 'id': '1003142',
'ext': 'm4a', 'ext': 'm4a',
'title': '【tsukimi】YELLOW / 神山羊', 'title': '【tsukimi】YELLOW / 神山羊',
'artist': 'tsukimi', 'artists': ['tsukimi'],
'comment_count': int, 'comment_count': int,
'description': 'YELLOW的mp3版', 'description': 'YELLOW的mp3版',
'duration': 183, 'duration': 183,
@ -1746,7 +1890,7 @@ class BilibiliAudioIE(BilibiliAudioBaseIE):
'thumbnail': r're:^https?://.+\.jpg', 'thumbnail': r're:^https?://.+\.jpg',
'timestamp': 1564836614, 'timestamp': 1564836614,
'upload_date': '20190803', 'upload_date': '20190803',
'uploader': 'tsukimi-つきみぐ', 'uploader': '十六夜tsukimiつきみぐ',
'view_count': int, 'view_count': int,
}, },
} }
@ -1801,10 +1945,10 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
'url': 'https://www.bilibili.com/audio/am10624', 'url': 'https://www.bilibili.com/audio/am10624',
'info_dict': { 'info_dict': {
'id': '10624', 'id': '10624',
'title': '每日新曲推荐每日11:00更新', 'title': '新曲推荐',
'description': '每天11:00更新为你推送最新音乐', 'description': '每天11:00更新为你推送最新音乐',
}, },
'playlist_count': 19, 'playlist_mincount': 10,
} }
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    # Handles single-video pages on video.kenh14.vn; the numeric suffix before `.chn` is the video id
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Extract formats and metadata for one video page."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The element with type="VideoStream" carries the media path (data-vid)
        # and a fallback thumbnail (data-thumb)
        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        attrs = extract_attributes(player_html or '')
        direct_url = attrs['data-vid']

        # The metadata API is queried with the media path minus its leading host part
        file_name = remove_start(direct_url, 'kenh14cdn.com/')
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(file_name),
            video_id, fatal=False)

        # The direct file is always offered; manifests are added when the sidecar JSON lists them
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}
        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
        manifest_extractors = (
            ('hls', self._extract_m3u8_formats_and_subtitles, {'m3u8_id': 'hls'}),
            ('mpd', self._extract_mpd_formats_and_subtitles, {'mpd_id': 'dash'}),
        )
        for manifest_key, extract_manifest, id_kwargs in manifest_extractors:
            manifest_url = traverse_obj(video_data, (manifest_key, {url_or_none}))
            if not manifest_url:
                continue
            fmts, subs = extract_manifest(manifest_url, video_id, fatal=False, **id_kwargs)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        api_fields = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })
        # Prefer the API title, then fall back to what the page itself offers
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')
        # The keywords meta tag is a ';'-separated list; empty items are dropped
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda x: x.split(';')}, ..., filter))

        return {
            **api_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    # Handles playlist pages on video.kenh14.vn; entries are delegated to Kenh14VideoIE
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Collect the playlist's video items and its title/description/thumbnail."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        # First JSON-LD object that has both a name and an alternateName; used as fallback metadata
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def entry_url(item_html):
            # Each item exposes its video id in `data-id`; build a video page URL from it
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(item_html)['data-id'])

        item_htmls = get_elements_html_by_class('video-item', webpage)
        playlist_title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        playlist_description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # The query string is removed from the OpenGraph thumbnail URL
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            item_htmls, playlist_id, playlist_title,
            getter=entry_url, ie=Kenh14VideoIE,
            playlist_description=playlist_description, thumbnail=thumbnail)