[CHZZK] Refactor video extractor

[CHZZK] Refactor and add tests
2024-11-29 10:41:23 +01:00 · 2024-01-17 22:48:27 +03:00 · 2024-01-17 19:52:04 +03:00
1 changed file with 80 additions and 67 deletions
--- a/yt_dlp/extractor/chzzk.py
+++ b/yt_dlp/extractor/chzzk.py
@ -1,60 +1,81 @@
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
-    traverse_obj,
+    int_or_none,
    parse_iso8601,
    traverse_obj,
    url_or_none,
 )
 class CHZZKLiveIE(InfoExtractor):
    IE_NAME = 'chzzk:live'
-    _VALID_URL = r'https?://chzzk\.naver\.com/live/(?P<id>[^/#?]+)'
+    _VALID_URL = r'https?://chzzk\.naver\.com/live/(?P<id>[\da-f]+)'
    _TESTS = [{
        'url': 'https://chzzk.naver.com/live/c68b8ef525fb3d2fa146344d84991753',
-        'only_matching': True,
+        'info_dict': {
            "id": "c68b8ef525fb3d2fa146344d84991753",
            "ext": "mp4",
            'title': str,
            'channel': '진짜도현',
            'channel_id': 'c68b8ef525fb3d2fa146344d84991753',
            'channel_is_verified': False,
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1705510344,
            'upload_date': '20240117',
            'live_status': 'is_live',
            'view_count': int,
        },
        'skip': 'The channel is not currently live',
    }]
    def _real_extract(self, url):
        channel_id = self._match_id(url)
-        live_detail_response = self._download_json(
+        live_detail = self._download_json(
-            f'https://api.chzzk.naver.com/service/v1/channels/{channel_id}/live-detail', channel_id,
+            f'https://api.chzzk.naver.com/service/v2/channels/{channel_id}/live-detail', channel_id,
-            note='Downloading channel info',
+            note='Downloading channel info', errnote='Unable to download channel info')['content']
            errnote='Unable to download channel info')
        live_detail = live_detail_response.get('content')
        if live_detail.get('status') == 'CLOSE':
            raise ExtractorError('The channel is not currently live', expected=True)
-        live_playback = self._parse_json(live_detail.get('livePlaybackJson'), channel_id)
+        live_playback = self._parse_json(live_detail['livePlaybackJson'], channel_id)
        thumbnails = []
-        thumbnail_template = traverse_obj(live_playback, ('thumbnail', 'snapshotThumbnailTemplate'))
+        thumbnail_template = traverse_obj(
-        for width in traverse_obj(live_playback, ('thumbnail', 'types')):
+            live_playback, ('thumbnail', 'snapshotThumbnailTemplate', {url_or_none}))
-            thumbnails.append({
+        if thumbnail_template:
-                'id': width,
+            for width in traverse_obj(live_playback, ('thumbnail', 'types', ..., {str})):
-                'url': thumbnail_template.replace('{type}', width),
+                thumbnails.append({
-                'width': int(width),
+                    'id': width,
-            })
+                    'url': thumbnail_template.replace('{type}', width),
                    'width': int_or_none(width),
                })
        formats, subtitles = [], {}
-        for media in live_playback.get('media'):
+        for media in traverse_obj(live_playback, ('media', lambda _, v: url_or_none(v['path']))):
-            media_url = media.get('path')
+            is_low_latency = media.get('mediaId') == 'LLHLS'
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(media_url, channel_id, 'mp4')
+            fmts, subs = self._extract_m3u8_formats_and_subtitles(
-            if media.get('mediaId') == 'LLHLS':
+                media['path'], channel_id, 'mp4', fatal=False, live=True,
-                for fmt in fmts:
+                m3u8_id='hls-ll' if is_low_latency else 'hls')
-                    fmt['format_id'] += '-ll'
+            for f in fmts:
                if is_low_latency:
                    f['source_preference'] = -2
                if '-afragalow.stream-audio.stream' in f['format_id']:
                    f['quality'] = -2
            formats.extend(fmts)
-            self._merge_subtitles(subtitles, subs)
+            self._merge_subtitles(subs, target=subtitles)
        return {
            'id': str(channel_id),
            'title': live_detail.get('liveTitle'),
            'thumbnails': thumbnails,
-            'timestamp': parse_iso8601(live_detail.get('openDate')),
+            **traverse_obj(live_detail, {
-            'view_count': live_detail.get('concurrentUserCount'),
+                'timestamp': ('openDate', {lambda d: parse_iso8601(d, ' ')}),
-            'channel': traverse_obj(live_detail, ('channel', 'channelName')),
+                'view_count': ('concurrentUserCount', {int_or_none}),
-            'channel_id': traverse_obj(live_detail, ('channel', 'channelId')),
+                'channel': ('channel', 'channelName', {str}),
-            'channel_is_verified': traverse_obj(live_detail, ('channel', 'verifiedMark')),
+                'channel_id': ('channel', 'channelId', {str}),
                'channel_is_verified': ('channel', 'verifiedMark', {bool}),
            }),
            'is_live': True,
            'formats': formats,
            'subtitles': subtitles,
@ -66,56 +87,48 @@ class CHZZKVideoIE(InfoExtractor):
    _VALID_URL = r'https?://chzzk\.naver\.com/video/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://chzzk.naver.com/video/1754',
-        'only_matching': True,
+        "md5": "b0c0c1bb888d913b93d702b1512c7f06",
        'info_dict': {
            "id": "1754",
            "ext": "mp4",
            'title': '치지직 테스트 방송',
            'channel': '침착맨',
            'channel_id': 'bb382c2c0cc9fa7c86ab3b037fb5799c',
            'channel_is_verified': False,
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 15577,
            'timestamp': 1702970505,
            'upload_date': '20231219',
            'view_count': int,
        },
    }]
    def _real_extract(self, url):
        NS_MAP = {
            'nvod': "urn:naver:vod:2020",
            '': "urn:mpeg:dash:schema:mpd:2011",
        }
        video_id = self._match_id(url)
-        video_meta_response = self._download_json(
+        video_meta = self._download_json(
-            f'https://api.chzzk.naver.com/service/v1/videos/{video_id}', video_id,
+            f'https://api.chzzk.naver.com/service/v2/videos/{video_id}', video_id,
-            note='Downloading video info',
+            note='Downloading video info', errnote='Unable to download video info')['content']
-            errnote='Unable to download video info')
+        formats, subtitles = self._extract_mpd_formats_and_subtitles(
-        video_meta = video_meta_response.get('content')
+            f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{video_meta["videoId"]}', video_id,
        vod_id = video_meta.get('videoId')
        in_key = video_meta.get('inKey')
        playback_xml = self._download_xml(
            f'https://apis.naver.com/neonplayer/vodplay/v1/playback/{vod_id}', video_id,
            query={
-                'key': in_key,
+                'key': video_meta['inKey'],
                'env': 'real',
                'lc': 'en_US',
                'cpl': 'en_US',
-            },
+            }, note='Downloading video playback', errnote='Unable to download video playback')
            note='Downloading video playback',
            errnote='Unable to download video playback')
        thumbnails = []
        i = 0
        for source in playback_xml.iterfind(
            './Period/SupplementalProperty/nvod:Thumbnails/nvod:ThumbnailSet/nvod:Thumbnail/nvod:Source',
            NS_MAP,
        ):
            thumbnails.append({'id': str(i), 'url': source.text.split('?')[0]})
            i += 1
        formats, subtitles = self._parse_mpd_formats_and_subtitles(playback_xml)
        return {
            'id': video_id,
            'title': video_meta.get('videoTitle'),
            'thumbnail': video_meta.get('thumbnailImageUrl'),
            'thumbnails': thumbnails,
            'timestamp': video_meta.get('publishDateAt'),
            'view_count': video_meta.get('readCount'),
            'duration': video_meta.get('duration'),
            'channel': traverse_obj(video_meta, ('channel', 'channelName')),
            'channel_id': traverse_obj(video_meta, ('channel', 'channelId')),
            'channel_is_verified': traverse_obj(video_meta, ('channel', 'verifiedMark')),
            'formats': formats,
            'subtitles': subtitles,
            **traverse_obj(video_meta, {
                'title': ('videoTitle', {str}),
                'thumbnail': ('thumbnailImageUrl', {url_or_none}),
                'timestamp': ('publishDateAt', {lambda t: int_or_none(t / 1000)}),
                'view_count': ('readCount', {int_or_none}),
                'duration': ('duration', {int_or_none}),
                'channel': ('channel', 'channelName', {str}),
                'channel_id': ('channel', 'channelId', {str}),
                'channel_is_verified': ('channel', 'verifiedMark', {bool}),
            }),
        }
Author	SHA1	Message	Date
DmitryScaletta	ef465aeace	[CHZZK] Refactor video extractor	2024-01-17 22:48:27 +03:00
DmitryScaletta	3f9ad7124e	[CHZZK] Refactor and add tests	2024-01-17 19:52:04 +03:00