Merge 0588bd7c82 into eb15fd5a32

[ie/kenh14] Add extractor (#3996 )
Closes #3937 Authored by: krichbanana, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-25 00:31:26 +01:00 · 2024-11-17 21:14:56 +05:30 · 2024-11-17 14:12:26 +00:00 · 2024-11-17 13:32:12 +01:00 · 2024-05-30 22:36:11 -05:00 · 2024-05-10 16:01:57 +02:00
4 changed files with 280 additions and 2 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -510,6 +510,7 @@ from .dfb import DFBIE
 from .dhm import DHMIE
 from .digitalconcerthall import DigitalConcertHallIE
 from .digiteka import DigitekaIE
 from .digiview import DigiviewIE
 from .discogs import DiscogsReleasePlaylistIE
 from .disney import DisneyIE
 from .dispeak import DigitallySpeakingIE
@ -946,6 +947,10 @@ from .kaltura import KalturaIE
 from .kankanews import KankaNewsIE
 from .karaoketv import KaraoketvIE
 from .kelbyone import KelbyOneIE
 from .kenh14 import (
    Kenh14PlaylistIE,
    Kenh14VideoIE,
 )
 from .khanacademy import (
    KhanAcademyIE,
    KhanAcademyUnitIE,
--- a/yt_dlp/extractor/chaturbate.py
+++ b/yt_dlp/extractor/chaturbate.py
@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
            'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
        }
-    def _extract_from_webpage(self, video_id, tld):
+    def _extract_from_html(self, video_id, tld):
        webpage = self._download_webpage(
            f'https://chaturbate.{tld}/{video_id}/', video_id,
            headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
    def _real_extract(self, url):
        video_id, tld = self._match_valid_url(url).group('id', 'tld')
-        return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld)
+        return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)
--- a/yt_dlp/extractor/digiview.py
+++ b/yt_dlp/extractor/digiview.py
@ -0,0 +1,113 @@
 import urllib.parse
 from yt_dlp.utils import int_or_none
 from ..networking import Request
 from .youtube import YoutubeIE
 class DigiviewIE(YoutubeIE):
    IE_DESC = 'Digiview'
    IE_NAME = 'digiview'
    _VALID_URL = r'https?://(?:www\.)?ladigitale\.dev/digiview/#/v/(?P<id>[0-9a-f]+)'
    _TESTS = [
        {
            # normal video
            'url': 'https://ladigitale.dev/digiview/#/v/663e17b35e979',
            'md5': 'acdf2c99c1e4d67664c9fbc5695986a9',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'channel': 'Philipp Hagemeister',
                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
                'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
                'upload_date': '20121002',
                'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22',
                'categories': ['Science & Technology'],
                'tags': ['youtube-dl'],
                'duration': 10,
                'view_count': int,
                'like_count': int,
                'availability': 'public',
                'playable_in_embed': True,
                'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
                'live_status': 'not_live',
                'age_limit': 0,
                'comment_count': int,
                'channel_follower_count': int,
                'uploader': 'Philipp Hagemeister',
                'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
                'uploader_id': '@PhilippHagemeister',
                'heatmap': 'count:100',
            }
        },
        {
            # cut video
            'url': 'https://ladigitale.dev/digiview/#/v/663e17f2f3f18',
            'md5': 'acdf2c99c1e4d67664c9fbc5695986a9',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'channel': 'Philipp Hagemeister',
                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
                'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
                'upload_date': '20121002',
                'description': 'md5:8fb536f4877b8a7455c2ec23794dbc22',
                'categories': ['Science & Technology'],
                'tags': ['youtube-dl'],
                'duration': 3,
                'view_count': int,
                'like_count': int,
                'availability': 'public',
                'playable_in_embed': True,
                'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
                'live_status': 'not_live',
                'age_limit': 0,
                'comment_count': int,
                'channel_follower_count': int,
                'uploader': 'Philipp Hagemeister',
                'uploader_url': 'https://www.youtube.com/@PhilippHagemeister',
                'uploader_id': '@PhilippHagemeister',
                'heatmap': 'count:100',
            }
        },
    ]
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage_data = self._download_json(
            Request(
                'https://ladigitale.dev/digiview/inc/recuperer_video.php',
                data=urllib.parse.urlencode({'id': video_id}).encode(),
                method='POST',
            ),
            video_id,
        )
        youtube_ie = YoutubeIE()
        youtube_ie.set_downloader(self._downloader)
        info = youtube_ie._real_extract(webpage_data['videoId'])
        # replace the YouTube metadata by the Digiview one
        info['title'] = webpage_data.get('titre') or info['title']
        info['description'] = webpage_data.get('description') or info['description']
        ffmpeg_args = []
        start_time = int_or_none(webpage_data.get('debut'))
        if start_time is not None and start_time != 0:
            ffmpeg_args.extend(['-ss', str(start_time)])
        end_time = int_or_none(webpage_data.get('fin'))
        if end_time is not None and end_time != info['duration']:
            ffmpeg_args.extend(['-t', str(end_time - (start_time or 0))])
        if ffmpeg_args and self._downloader:
            # cut the video if specified in the Digiview webpage
            ppargs = self._downloader.params.get("postprocessor_args")
            ppargs.setdefault("merger", []).extend(ffmpeg_args)
        return info
--- a/yt_dlp/extractor/kenh14.py
+++ b/yt_dlp/extractor/kenh14.py
@ -0,0 +1,160 @@
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_attribute,
    get_elements_html_by_class,
    int_or_none,
    parse_duration,
    parse_iso8601,
    remove_start,
    strip_or_none,
    unescapeHTML,
    update_url,
    url_or_none,
 )
 from ..utils.traversal import traverse_obj
 class Kenh14VideoIE(InfoExtractor):
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
        direct_url = attrs['data-vid']
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
                remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}
        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
        if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
            fmts, subs = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        return {
            **traverse_obj(metadata, {
                'duration': ('duration', {parse_duration}),
                'uploader': ('author', {strip_or_none}),
                'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
                'view_count': ('views', {int_or_none}),
            }),
            'id': video_id,
            'title': (
                traverse_obj(metadata, ('title', {strip_or_none}))
                or clean_html(self._og_search_title(webpage))
                or clean_html(get_element_by_class('vdbw-title', webpage))),
            'formats': formats,
            'subtitles': subtitles,
            'description': (
                clean_html(self._og_search_description(webpage))
                or clean_html(get_element_by_class('vdbw-sapo', webpage))),
            'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
            'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
                {lambda x: x.split(';')}, ..., filter)),
        }
 class Kenh14PlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]
    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        category_detail = get_element_by_class('category-detail', webpage) or ''
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}
        return self.playlist_from_matches(
            get_elements_html_by_class('video-item', webpage), playlist_id,
            (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
            getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
            ie=Kenh14VideoIE, playlist_description=(
                clean_html(get_element_by_class('description', category_detail))
                or unescapeHTML(embed_info.get('alternateName'))),
            thumbnail=traverse_obj(
                self._og_search_thumbnail(webpage),
                ({url_or_none}, {update_url(query=None)})))
Author	SHA1	Message	Date
Laurent FAVOLE	8aae43ad1c	Merge `0588bd7c82` into `eb15fd5a32`	2024-11-17 21:14:56 +05:30
krichbanana	eb15fd5a32	[ie/kenh14] Add extractor (#3996 ) Closes #3937 Authored by: krichbanana, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>	2024-11-17 14:12:26 +00:00
sepro	7cecd299e4	[ie/chaturbate] Don't break embed detection (#11565 ) Bugfix for `720b3dc453` Authored by: seproDev	2024-11-17 13:32:12 +01:00
bashonly	0588bd7c82	Merge branch 'master' into digiview-extractor	2024-05-30 22:36:11 -05:00
Laurent FAVOLE	205826121d	Add Digiview extractor	2024-05-10 16:01:57 +02:00