Merge 3095d815c9 into eb15fd5a32

[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612
Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-25 00:31:26 +01:00 · 2024-11-17 21:14:57 +05:30 · 2024-11-17 14:12:26 +00:00 · 2024-11-17 13:32:12 +01:00 · 2024-10-19 18:22:53 +00:00 · 2024-10-19 17:59:18 +00:00
7 changed files with 538 additions and 170 deletions
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -3541,7 +3541,8 @@ class YoutubeDL:
                                     'writing DASH m4a. Only some players support this container',
                                     FFmpegFixupM4aPP)
                        ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts')
-                                     or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None,
+                                     or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None
+                                     or downloader == 'niconico_live',
                                     'Possible MPEG-TS in MP4 container or malformed AAC timestamps',
                                     FFmpegFixupM3u8PP)
                        ffmpeg_fixup(downloader == 'dashsegments'
--- a/yt_dlp/downloader/niconico.py
+++ b/yt_dlp/downloader/niconico.py
@ -1,12 +1,22 @@
+import contextlib
 import json
+import math
 import threading
 import time

 from . import get_suitable_downloader
 from .common import FileDownloader
 from .external import FFmpegFD
+from ..downloader.fragment import FragmentFD
 from ..networking import Request
-from ..utils import DownloadError, str_or_none, try_get
+from ..networking.exceptions import network_exceptions
+from ..utils import (
+    DownloadError,
+    RetryManager,
+    str_or_none,
+    traverse_obj,
+    urljoin,
+)


 class NiconicoDmcFD(FileDownloader):
@ -56,85 +66,184 @@ class NiconicoDmcFD(FileDownloader):
        return success


-class NiconicoLiveFD(FileDownloader):
-    """ Downloads niconico live without being stopped """
+class NiconicoLiveFD(FragmentFD):
+    """ Downloads niconico live/timeshift VOD """

-    def real_download(self, filename, info_dict):
-        video_id = info_dict['video_id']
-        ws_url = info_dict['url']
-        ws_extractor = info_dict['ws']
-        ws_origin_host = info_dict['origin']
-        live_quality = info_dict.get('live_quality', 'high')
-        live_latency = info_dict.get('live_latency', 'high')
-        dl = FFmpegFD(self.ydl, self.params or {})
+    _PER_FRAGMENT_DOWNLOAD_RATIO = 0.1
+    _WEBSOCKET_RECONNECT_DELAY = 10

-        new_info_dict = info_dict.copy()
-        new_info_dict.update({
-            'protocol': 'm3u8',
-        })
+    @contextlib.contextmanager
+    def _ws_context(self, info_dict):
+        """ Hold a WebSocket object and release it when leaving """

-        def communicate_ws(reconnect):
-            if reconnect:
-                ws = self.ydl.urlopen(Request(ws_url, headers={'Origin': f'https://{ws_origin_host}'}))
-                if self.ydl.params.get('verbose', False):
-                    self.to_screen('[debug] Sending startWatching request')
-                ws.send(json.dumps({
-                    'type': 'startWatching',
-                    'data': {
-                        'stream': {
-                            'quality': live_quality,
-                            'protocol': 'hls+fmp4',
-                            'latency': live_latency,
-                            'chasePlay': False,
-                        },
-                        'room': {
-                            'protocol': 'webSocket',
-                            'commentable': True,
-                        },
-                        'reconnect': True,
+        video_id = info_dict['id']
+        format_id = info_dict['format_id']
+        live_latency = info_dict['downloader_options']['live_latency']
+        ws_url = info_dict['downloader_options']['ws_url']
+
+        self.ws = None
+
+        self.m3u8_lock = threading.Event()
+        self.m3u8_url = None
+
+        def communicate_ws():
+            self.ws = self.ydl.urlopen(Request(ws_url, headers=info_dict.get('http_headers')))
+            if self.ydl.params.get('verbose', False):
+                self.write_debug('Sending HLS server request')
+            self.ws.send(json.dumps({
+                'type': 'startWatching',
+                'data': {
+                    'stream': {
+                        'quality': format_id,
+                        'protocol': 'hls',
+                        'latency': live_latency,
+                        'chasePlay': False,
                    },
-                }))
-            else:
-                ws = ws_extractor
-            with ws:
+                    'room': {
+                        'protocol': 'webSocket',
+                        'commentable': True,
+                    },
+                },
+            }))
+            with self.ws:
                while True:
-                    recv = ws.recv()
+                    recv = self.ws.recv()
                    if not recv:
                        continue
                    data = json.loads(recv)
-                    if not data or not isinstance(data, dict):
+                    if not isinstance(data, dict):
                        continue
                    if data.get('type') == 'ping':
                        # pong back
-                        ws.send(r'{"type":"pong"}')
-                        ws.send(r'{"type":"keepSeat"}')
+                        self.ws.send(r'{"type":"pong"}')
+                        self.ws.send(r'{"type":"keepSeat"}')
+                    elif data.get('type') == 'stream':
+                        self.m3u8_url = data['data']['uri']
+                        self.m3u8_lock.set()
                    elif data.get('type') == 'disconnect':
                        self.write_debug(data)
-                        return True
+                        return
                    elif data.get('type') == 'error':
                        self.write_debug(data)
-                        message = try_get(data, lambda x: x['body']['code'], str) or recv
-                        return DownloadError(message)
+                        message = traverse_obj(data, ('data', 'code')) or recv
+                        raise DownloadError(message)
                    elif self.ydl.params.get('verbose', False):
                        if len(recv) > 100:
                            recv = recv[:100] + '...'
-                        self.to_screen(f'[debug] Server said: {recv}')
+                        self.write_debug(f'Server said: {recv}')
+
+        stopped = threading.Event()

        def ws_main():
-            reconnect = False
-            while True:
+            while not stopped.is_set():
                try:
-                    ret = communicate_ws(reconnect)
-                    if ret is True:
-                        return
-                except BaseException as e:
-                    self.to_screen('[{}] {}: Connection error occured, reconnecting after 10 seconds: {}'.format('niconico:live', video_id, str_or_none(e)))
-                    time.sleep(10)
-                    continue
-                finally:
-                    reconnect = True
+                    communicate_ws()
+                    break  # Disconnected
+                except BaseException as e:  # Including TransportError
+                    if stopped.is_set():
+                        break
+
+                    self.m3u8_lock.clear()  # m3u8 url may be changed
+
+                    self.to_screen('[{}] {}: Connection error occured, reconnecting after {} seconds: {}'.format(
+                        'niconico:live', video_id, self._WEBSOCKET_RECONNECT_DELAY, str_or_none(e)))
+                    time.sleep(self._WEBSOCKET_RECONNECT_DELAY)
+
+            self.m3u8_lock.set()  # Release possible locks

        thread = threading.Thread(target=ws_main, daemon=True)
        thread.start()

-        return dl.download(filename, new_info_dict)
+        try:
+            yield self
+        finally:
+            stopped.set()
+            if self.ws:
+                self.ws.close()
+            thread.join()
+
+    def _master_m3u8_url(self):
+        """
+        Get the refreshed manifest url after WebSocket reconnection to prevent HTTP 403
+
+        Blocks until the WebSocket thread has published a manifest url or has stopped.
+
+        @returns    The most recently received master m3u8 url
+        @raises     DownloadError if the wait ended without any manifest url being available
+        """
+
+        self.m3u8_lock.wait()
+        if not self.m3u8_url:
+            # The event is also set when the WebSocket loop ends ("release possible locks"),
+            # so being woken up does not guarantee that a url was ever received
+            raise DownloadError('Unable to get the m3u8 url from the WebSocket server')
+        return self.m3u8_url
+
+    def real_download(self, filename, info_dict):
+        """
+        Download a niconico live stream or a timeshift VOD
+
+        The WebSocket session from _ws_context() is held for the whole download.
+        Live streams are handed over to FFmpegFD as plain m3u8. Timeshift VODs are
+        fetched fragment by fragment; each round re-requests the media playlist
+        starting at the offset that has been downloaded so far.
+
+        @param filename     Output file name
+        @param info_dict    Format dict; 'id' and 'url' are required here, further keys are
+                            read by _ws_context()
+        @returns            The FFmpegFD result for live streams. For timeshift: False when a
+                            fragment download reports failure or the retry manager ends with
+                            an error, otherwise the result of _finish_frag_download()
+        @raises             DownloadError if the playlist lacks the duration information
+        """
+        with self._ws_context(info_dict) as ws_context:
+            # live
+            if info_dict.get('is_live'):
+                info_dict = info_dict.copy()
+                info_dict['protocol'] = 'm3u8'
+                return FFmpegFD(self.ydl, self.params or {}).download(filename, info_dict)
+
+            # timeshift VOD
+            from ..extractor.niconico import NiconicoIE
+            ie = NiconicoIE(self.ydl)
+
+            video_id = info_dict['id']
+
+            # Get video info
+            total_duration = 0
+            fragment_duration = 0
+            for line in ie._download_webpage(info_dict['url'], video_id, note='Downloading m3u8').splitlines():
+                if '#STREAM-DURATION' in line:
+                    total_duration = int(float(line.split(':')[1]))
+                if '#EXT-X-TARGETDURATION' in line:
+                    fragment_duration = int(line.split(':')[1])
+            if not (total_duration and fragment_duration):
+                raise DownloadError('Unable to get required video info')
+
+            ctx = {
+                'filename': filename,
+                'total_frags': math.ceil(total_duration / fragment_duration),
+            }
+
+            self._prepare_and_start_frag_download(ctx, info_dict)
+
+            # 'fragment_index' is filled in by _prepare_and_start_frag_download()
+            # (presumably non-zero when resuming a partial download)
+            downloaded_duration = ctx['fragment_index'] * fragment_duration
+            reached_end = False
+            while not reached_end and downloaded_duration <= total_duration:
+                retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry)
+                for retry in retry_manager:
+                    try:
+                        # Refresh master m3u8 (if possible) to get the new URL of the previously-chosen format
+                        media_m3u8_url = ie._extract_m3u8_formats(
+                            ws_context._master_m3u8_url(), video_id, note=False,
+                            query={'start': downloaded_duration}, live=False)[0]['url']
+
+                        # Get all fragments
+                        media_m3u8 = ie._download_webpage(
+                            media_m3u8_url, video_id, note=False, errnote='Unable to download media m3u8')
+                        fragment_urls = traverse_obj(media_m3u8.splitlines(), (
+                            lambda _, v: not v.startswith('#'), {lambda url: urljoin(media_m3u8_url, url)}))
+
+                        if not fragment_urls:
+                            # Without fragments the offset never advances, which would loop forever
+                            if downloaded_duration < total_duration:
+                                raise DownloadError('Media m3u8 does not contain any fragments')
+                            # The offset is exactly at the end of the video: nothing is left to fetch
+                            reached_end = True
+
+                        with self.DurationLimiter(len(fragment_urls) * fragment_duration * self._PER_FRAGMENT_DOWNLOAD_RATIO):
+                            for fragment_url in fragment_urls:
+                                success = self._download_fragment(ctx, fragment_url, info_dict)
+                                if not success:
+                                    return False
+                                self._append_fragment(ctx, self._read_fragment(ctx))
+                                downloaded_duration += fragment_duration
+
+                    except (DownloadError, *network_exceptions) as err:
+                        retry.error = err
+                        continue
+
+                if retry_manager.error:
+                    return False
+
+            return self._finish_frag_download(ctx, info_dict)
+
+    class DurationLimiter:
+        """
+        Context manager that makes its body take at least `target` seconds
+
+        If the body finishes earlier, the remaining time is slept away on exit;
+        if it takes longer, no extra delay is added. Exceptions are not suppressed.
+        """
+
+        def __init__(self, target):
+            self.target = target
+            self.start = None
+
+        def __enter__(self):
+            # Monotonic clock: the elapsed time must not be skewed by system clock adjustments
+            self.start = time.monotonic()
+            return self
+
+        def __exit__(self, *exc):
+            remaining = self.target - (time.monotonic() - self.start)
+            if remaining > 0:
+                time.sleep(remaining)
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -946,6 +946,10 @@ from .kaltura import KalturaIE
 from .kankanews import KankaNewsIE
 from .karaoketv import KaraoketvIE
 from .kelbyone import KelbyOneIE
+from .kenh14 import (
+    Kenh14PlaylistIE,
+    Kenh14VideoIE,
+)
 from .khanacademy import (
    KhanAcademyIE,
    KhanAcademyUnitIE,
--- a/yt_dlp/extractor/chaturbate.py
+++ b/yt_dlp/extractor/chaturbate.py
@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
            'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
        }

-    def _extract_from_webpage(self, video_id, tld):
+    def _extract_from_html(self, video_id, tld):
        webpage = self._download_webpage(
            f'https://chaturbate.{tld}/{video_id}/', video_id,
            headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):

    def _real_extract(self, url):
        video_id, tld = self._match_valid_url(url).group('id', 'tld')
-        return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld)
+        return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)
--- a/yt_dlp/extractor/kenh14.py
+++ b/yt_dlp/extractor/kenh14.py
@ -0,0 +1,160 @@
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    get_element_by_class,
+    get_element_html_by_attribute,
+    get_elements_html_by_class,
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    remove_start,
+    strip_or_none,
+    unescapeHTML,
+    update_url,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class Kenh14VideoIE(InfoExtractor):
+    # Single video pages on video.kenh14.vn; the numeric suffix of the slug is the video id
+    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
+    _TESTS = [{
+        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
+        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
+        'info_dict': {
+            'id': '316173',
+            'ext': 'mp4',
+            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
+            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
+            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
+            'tags': [],
+            'uploader': 'Unbox Therapy',
+            'upload_date': '20220517',
+            'view_count': int,
+            'duration': 722.86,
+            'timestamp': 1652764468,
+        },
+    }, {
+        'url': 'https://video.kenh14.vn/video-316174.chn',
+        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
+        'info_dict': {
+            'id': '316174',
+            'ext': 'mp4',
+            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
+            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
+            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
+            'tags': [],
+            'upload_date': '20220517',
+            'view_count': int,
+            'duration': 70.04,
+            'timestamp': 1652766021,
+        },
+    }, {
+        'url': 'https://video.kenh14.vn/0-344740.chn',
+        'md5': 'b843495d5e728142c8870c09b46df2a9',
+        'info_dict': {
+            'id': '344740',
+            'ext': 'mov',
+            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
+            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
+            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
+            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
+            'uploader': 'Quang Vũ',
+            'upload_date': '20241024',
+            'view_count': int,
+            'duration': 198.88,
+            'timestamp': 1729741590,
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Build the info dict from the page's player element, the metadata API and the format manifests"""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The element with type="VideoStream" carries the scheme-less media URL in data-vid
+        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
+        attrs = extract_attributes(player_html or '')
+        direct_url = attrs['data-vid']
+
+        # Metadata is optional: the request is non-fatal and every field falls back below
+        api_file_name = remove_start(direct_url, 'kenh14cdn.com/')
+        metadata = self._download_json(
+            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(api_file_name),
+            video_id, fatal=False)
+
+        # The direct file is always offered; HLS/DASH are added when the sidecar JSON lists them
+        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
+        subtitles = {}
+        video_data = self._download_json(
+            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
+
+        hls_url = traverse_obj(video_data, ('hls', {url_or_none}))
+        if hls_url:
+            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
+                hls_url, video_id, m3u8_id='hls', fatal=False)
+            formats += hls_formats
+            self._merge_subtitles(hls_subtitles, target=subtitles)
+
+        dash_url = traverse_obj(video_data, ('mpd', {url_or_none}))
+        if dash_url:
+            dash_formats, dash_subtitles = self._extract_mpd_formats_and_subtitles(
+                dash_url, video_id, mpd_id='dash', fatal=False)
+            formats += dash_formats
+            self._merge_subtitles(dash_subtitles, target=subtitles)
+
+        api_fields = traverse_obj(metadata, {
+            'duration': ('duration', {parse_duration}),
+            'uploader': ('author', {strip_or_none}),
+            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
+            'view_count': ('views', {int_or_none}),
+        })
+
+        # Prefer the API title, then OpenGraph, then the on-page heading
+        title = traverse_obj(metadata, ('title', {strip_or_none}))
+        if not title:
+            title = clean_html(self._og_search_title(webpage))
+        if not title:
+            title = clean_html(get_element_by_class('vdbw-title', webpage))
+
+        description = clean_html(self._og_search_description(webpage))
+        if not description:
+            description = clean_html(get_element_by_class('vdbw-sapo', webpage))
+
+        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')
+
+        # The keywords meta tag is a ';'-separated list; empty items are filtered out
+        keywords = self._html_search_meta('keywords', webpage)
+        tags = traverse_obj(keywords, ({lambda x: x.split(';')}, ..., filter))
+
+        return {
+            **api_fields,
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': description,
+            'thumbnail': thumbnail,
+            'tags': tags,
+        }
+
+
+class Kenh14PlaylistIE(InfoExtractor):
+    # Playlist pages on video.kenh14.vn; the numeric suffix of the slug is the playlist id
+    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
+    _TESTS = [{
+        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
+        'info_dict': {
+            'id': '71',
+            'title': 'Trần Tình (Naked love) mùa 2',
+            'description': 'md5:e9522339304956dea931722dd72eddb2',
+            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
+        },
+        'playlist_count': 9,
+    }, {
+        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
+        'info_dict': {
+            'id': '72',
+            'title': 'Lau Lại Đầu Từ',
+            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
+            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
+        },
+        'playlist_count': 6,
+    }]
+
+    def _real_extract(self, url):
+        """Collect the 'video-item' elements of a playlist page into a playlist of Kenh14VideoIE entries"""
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        category_detail = get_element_by_class('category-detail', webpage) or ''
+        # Fallback source: the first JSON-LD object that has both a name and an alternateName
+        json_ld_objects = self._yield_json_ld(webpage, playlist_id)
+        embed_info = traverse_obj(
+            json_ld_objects, (lambda _, v: v['name'] and v['alternateName'], any)) or {}
+
+        def entry_url(item_html):
+            # Each playlist item only exposes its numeric id; rebuild the canonical video URL from it
+            return 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(item_html)['data-id'])
+
+        video_items = get_elements_html_by_class('video-item', webpage)
+
+        playlist_title = clean_html(get_element_by_class('name', category_detail))
+        if not playlist_title:
+            playlist_title = unescapeHTML(embed_info.get('name'))
+
+        playlist_description = clean_html(get_element_by_class('description', category_detail))
+        if not playlist_description:
+            playlist_description = unescapeHTML(embed_info.get('alternateName'))
+
+        # Strip the query string from the OpenGraph thumbnail URL
+        thumbnail = traverse_obj(
+            self._og_search_thumbnail(webpage),
+            ({url_or_none}, {update_url(query=None)}))
+
+        return self.playlist_from_matches(
+            video_items, playlist_id, playlist_title,
+            getter=entry_url, ie=Kenh14VideoIE,
+            playlist_description=playlist_description, thumbnail=thumbnail)
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@ -7,7 +7,6 @@ import time
 import urllib.parse

 from .common import InfoExtractor, SearchInfoExtractor
-from ..networking import Request
 from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
@ -32,12 +31,56 @@ from ..utils import (
 )


-class NiconicoIE(InfoExtractor):
-    IE_NAME = 'niconico'
-    IE_DESC = 'ニコニコ動画'
+class NiconicoBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'niconico'
    _GEO_COUNTRIES = ['JP']
    _GEO_BYPASS = False

+    def _perform_login(self, username, password):
+        """
+        Log in through account.nicovideo.jp, including the one-time-password step if requested
+
+        @param username     Sent as the 'mail_tel' form field
+        @param password     Sent as the 'password' form field
+        @returns            False (after reporting a warning) if the MFA challenge fails or the
+                            response contains the error notice, otherwise True
+        """
+        # The response is discarded; the request is only made to acquire a login session
+        self._request_webpage(
+            'https://account.nicovideo.jp/login', None,
+            note='Acquiring Login session')
+
+        credentials = urlencode_postdata({
+            'mail_tel': username,
+            'password': password,
+        })
+        page = self._download_webpage(
+            'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None,
+            note='Logging in', errnote='Unable to log in', data=credentials,
+            headers={
+                'Referer': 'https://account.nicovideo.jp/login',
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
+
+        if 'oneTimePw' in page:
+            # The MFA form is posted to whatever action URL the challenge page declares
+            mfa_action = self._search_regex(
+                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, 'post url', group='url')
+            otp_form = urlencode_postdata({
+                'otp': self._get_tfa_info('6 digits code'),
+            })
+            page = self._download_webpage(
+                urljoin('https://account.nicovideo.jp', mfa_action), None,
+                note='Performing MFA', errnote='Unable to complete MFA', data=otp_form,
+                headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                })
+            # Still seeing the OTP form (or a form error) means the challenge was rejected
+            if 'oneTimePw' in page or 'formError' in page:
+                err_msg = self._html_search_regex(
+                    r'formError["\']+>(.*?)</div>', page, 'form_error',
+                    default='There\'s an error but the message can\'t be parsed.',
+                    flags=re.DOTALL)
+                self.report_warning(f'Unable to log in: MFA challenge failed, "{err_msg}"')
+                return False
+
+        if 'class="notice error"' in page:
+            self.report_warning('Unable to log in: bad username or password')
+            return False
+        return True
+
+
+class NiconicoIE(NiconicoBaseIE):
+    IE_NAME = 'niconico'
+    IE_DESC = 'ニコニコ動画'
+
    _TESTS = [{
        'url': 'http://www.nicovideo.jp/watch/sm22312215',
        'info_dict': {
@ -176,7 +219,6 @@ class NiconicoIE(InfoExtractor):
    }]

    _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)'
-    _NETRC_MACHINE = 'niconico'
    _API_HEADERS = {
        'X-Frontend-ID': '6',
        'X-Frontend-Version': '0',
@ -185,46 +227,6 @@ class NiconicoIE(InfoExtractor):
        'Origin': 'https://www.nicovideo.jp',
    }

-    def _perform_login(self, username, password):
-        login_ok = True
-        login_form_strs = {
-            'mail_tel': username,
-            'password': password,
-        }
-        self._request_webpage(
-            'https://account.nicovideo.jp/login', None,
-            note='Acquiring Login session')
-        page = self._download_webpage(
-            'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None,
-            note='Logging in', errnote='Unable to log in',
-            data=urlencode_postdata(login_form_strs),
-            headers={
-                'Referer': 'https://account.nicovideo.jp/login',
-                'Content-Type': 'application/x-www-form-urlencoded',
-            })
-        if 'oneTimePw' in page:
-            post_url = self._search_regex(
-                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, 'post url', group='url')
-            page = self._download_webpage(
-                urljoin('https://account.nicovideo.jp', post_url), None,
-                note='Performing MFA', errnote='Unable to complete MFA',
-                data=urlencode_postdata({
-                    'otp': self._get_tfa_info('6 digits code'),
-                }), headers={
-                    'Content-Type': 'application/x-www-form-urlencoded',
-                })
-            if 'oneTimePw' in page or 'formError' in page:
-                err_msg = self._html_search_regex(
-                    r'formError["\']+>(.*?)</div>', page, 'form_error',
-                    default='There\'s an error but the message can\'t be parsed.',
-                    flags=re.DOTALL)
-                self.report_warning(f'Unable to log in: MFA challenge failed, "{err_msg}"')
-                return False
-        login_ok = 'class="notice error"' not in page
-        if not login_ok:
-            self.report_warning('Unable to log in: bad username or password')
-        return login_ok
-
    def _get_heartbeat_info(self, info_dict):
        video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/')
        dmc_protocol = info_dict['expected_protocol']
@ -906,7 +908,7 @@ class NiconicoUserIE(InfoExtractor):
        return self.playlist_result(self._entries(list_id), list_id)


-class NiconicoLiveIE(InfoExtractor):
+class NiconicoLiveIE(NiconicoBaseIE):
    IE_NAME = 'niconico:live'
    IE_DESC = 'ニコニコ生放送'
    _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)'
@ -916,17 +918,30 @@ class NiconicoLiveIE(InfoExtractor):
        'info_dict': {
            'id': 'lv339533123',
            'title': '激辛ペヤング食べます\u202a( ;ᯅ; )\u202c（歌枠オーディション参加中）',
-            'view_count': 1526,
-            'comment_count': 1772,
+            'view_count': int,
+            'comment_count': int,
            'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます',
            'uploader': 'もか',
            'channel': 'ゲストさんのコミュニティ',
            'channel_id': 'co5776900',
            'channel_url': 'https://com.nicovideo.jp/community/co5776900',
            'timestamp': 1670677328,
-            'is_live': True,
+            'ext': None,
+            'live_latency': 'high',
+            'live_status': 'was_live',
+            'thumbnail': r're:^https://[\w.-]+/\w+/\w+',
+            'thumbnails': list,
+            'upload_date': '20221210',
        },
-        'skip': 'livestream',
+        'params': {
+            'skip_download': True,
+            'ignore_no_formats_error': True,
+        },
+        'expected_warnings': [
+            'The live hasn\'t started yet or already ended.',
+            'No video formats found!',
+            'Requested format is not available',
+        ],
    }, {
        'url': 'https://live2.nicovideo.jp/watch/lv339533123',
        'only_matching': True,
@ -940,36 +955,17 @@ class NiconicoLiveIE(InfoExtractor):

    _KNOWN_LATENCY = ('high', 'low')

-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
-
-        embedded_data = self._parse_json(unescapeHTML(self._search_regex(
-            r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
-
-        ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
-        if not ws_url:
-            raise ExtractorError('The live hasn\'t started yet or already ended.', expected=True)
-        ws_url = update_url_query(ws_url, {
-            'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
-        })
-
-        hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.')
-        latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
-        if latency not in self._KNOWN_LATENCY:
-            latency = 'high'
-
+    def _yield_formats(self, ws_url, headers, latency, video_id, is_live):
        ws = self._request_webpage(
-            Request(ws_url, headers={'Origin': f'https://{hostname}'}),
-            video_id=video_id, note='Connecting to WebSocket server')
+            ws_url, video_id, note='Connecting to WebSocket server', headers=headers)

-        self.write_debug('[debug] Sending HLS server request')
+        self.write_debug('Sending HLS server request')
        ws.send(json.dumps({
            'type': 'startWatching',
            'data': {
                'stream': {
                    'quality': 'abr',
-                    'protocol': 'hls+fmp4',
+                    'protocol': 'hls',
                    'latency': latency,
                    'chasePlay': False,
                },
@ -977,32 +973,55 @@ class NiconicoLiveIE(InfoExtractor):
                    'protocol': 'webSocket',
                    'commentable': True,
                },
-                'reconnect': False,
            },
        }))

-        while True:
-            recv = ws.recv()
-            if not recv:
-                continue
-            data = json.loads(recv)
-            if not isinstance(data, dict):
-                continue
-            if data.get('type') == 'stream':
-                m3u8_url = data['data']['uri']
-                qualities = data['data']['availableQualities']
-                break
-            elif data.get('type') == 'disconnect':
-                self.write_debug(recv)
-                raise ExtractorError('Disconnected at middle of extraction')
-            elif data.get('type') == 'error':
-                self.write_debug(recv)
-                message = traverse_obj(data, ('body', 'code')) or recv
-                raise ExtractorError(message)
-            elif self.get_param('verbose', False):
-                if len(recv) > 100:
-                    recv = recv[:100] + '...'
-                self.write_debug(f'Server said: {recv}')
+        with ws:
+            while True:
+                recv = ws.recv()
+                if not recv:
+                    continue
+                data = json.loads(recv)
+                if not isinstance(data, dict):
+                    continue
+                if data.get('type') == 'stream':
+                    m3u8_url = data['data']['uri']
+                    qualities = data['data']['availableQualities']
+                    break
+                elif data.get('type') == 'disconnect':
+                    self.write_debug(data)
+                    raise ExtractorError('Disconnected at middle of extraction')
+                elif data.get('type') == 'error':
+                    self.write_debug(data)
+                    message = traverse_obj(data, ('data', 'code')) or recv
+                    raise ExtractorError(message)
+                elif self.get_param('verbose', False):
+                    if len(recv) > 100:
+                        recv = recv[:100] + '...'
+                    self.write_debug(f'Server said: {recv}')
+
+        formats = sorted(self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', live=is_live), key=lambda f: f['tbr'], reverse=True)
+        for fmt, q in zip(formats, qualities[1:]):
+            fmt.update({
+                'format_id': q,
+                'protocol': 'niconico_live',
+            })
+            yield fmt
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
+        headers = {'Origin': 'https://' + remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.')}
+
+        embedded_data = self._parse_json(unescapeHTML(self._search_regex(
+            r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
+
+        ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
+        if ws_url:
+            ws_url = update_url_query(ws_url, {
+                'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
+            })

        title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta(
            ('og:title', 'twitter:title'), webpage, 'live title', fatal=False)
@ -1028,16 +1047,19 @@ class NiconicoLiveIE(InfoExtractor):
                    **res,
                })

-        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True)
-        for fmt, q in zip(formats, reversed(qualities[1:])):
-            fmt.update({
-                'format_id': q,
-                'protocol': 'niconico_live',
-                'ws': ws,
-                'video_id': video_id,
-                'live_latency': latency,
-                'origin': hostname,
-            })
+        live_status, availability = self._check_status_and_availability(embedded_data, video_id)
+
+        if availability == 'premium_only':
+            self.raise_login_required('This video requires premium', metadata_available=True)
+        elif availability == 'subscriber_only':
+            self.raise_login_required('This video is for members only', metadata_available=True)
+        elif availability == 'needs_auth':
+            # PPV or tickets for limited time viewing
+            self.raise_login_required('This video requires additional steps to watch', metadata_available=True)
+
+        latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
+        if latency not in self._KNOWN_LATENCY:
+            latency = 'high'

        return {
            'id': video_id,
@ -1052,7 +1074,79 @@ class NiconicoLiveIE(InfoExtractor):
            }),
            'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))),
            'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))),
-            'is_live': True,
+            'live_status': live_status,
+            'availability': availability,
            'thumbnails': thumbnails,
-            'formats': formats,
+            'formats': [*self._yield_formats(
+                ws_url, headers, latency, video_id, live_status == 'is_live')] if ws_url else None,
+            'http_headers': headers,
+            'downloader_options': {
+                'live_latency': latency,
+                'ws_url': ws_url,
+            },
        }
+
+    def _check_status_and_availability(self, embedded_data, video_id):
+        """Derive the live status and the availability of a program from its embedded page data.
+
+        @param embedded_data  Parsed "embedded-data" object of the watch page. Only the
+                              "programTimeshift", "userProgramWatch" and "program" subtrees are read.
+        @param video_id       Passed along with every warning reported from here.
+        @returns              A `(live_status, availability)` tuple. `live_status` is one of
+                              'is_live', 'was_live', 'is_upcoming' or 'post_live'; `availability`
+                              is whatever `self._availability(...)` returns for the detected flags.
+
+        Restrictions are only reported (warnings, or a geo-restriction notice requested with
+        `metadata_available=True`); deciding whether to abort is left to the caller.
+        """
+        # Initial guess from the timeshift publication status; any other or missing value
+        # falls back to 'is_live'. It may be overridden by the rejected reasons further down.
+        live_status = {
+            'Before': 'is_live',
+            'Open': 'was_live',
+            'End': 'was_live',
+        }.get(traverse_obj(embedded_data, ('programTimeshift', 'publication', 'status', {str})), 'is_live')
+
+        # Case 1: the page says the program can be watched, possibly only in part.
+        if traverse_obj(embedded_data, ('userProgramWatch', 'canWatch', {bool})):
+            is_member_free = traverse_obj(embedded_data, ('program', 'isMemberFree', {bool}))
+            is_shown = traverse_obj(embedded_data, ('program', 'trialWatch', 'isShown', {bool}))
+            self.write_debug(f'.program.isMemberFree: {is_member_free}; .program.trialWatch.isShown: {is_shown}')
+
+            # Neither hint is present: no restriction flag is passed on.
+            if is_member_free is None and is_shown is None:
+                return live_status, self._availability()
+
+            # An explicit `isMemberFree: false` is treated as paid content; every other
+            # combination that reaches this point is treated as subscription-restricted.
+            if is_member_free is False:
+                availability = {'needs_auth': True}
+                msg = 'Paid content cannot be accessed, the video may be blank.'
+            else:
+                availability = {'needs_subscription': True}
+                msg = 'Restricted content cannot be accessed, a part of the video or the entire video may be blank.'
+            self.report_warning(msg, video_id)
+            return live_status, self._availability(**availability)
+
+        # Case 2: not watchable because of a country restriction.
+        if traverse_obj(embedded_data, ('userProgramWatch', 'isCountryRestrictionTarget', {bool})):
+            # NOTE(review): the return below is only reached if raise_geo_restricted() returns
+            # instead of raising; presumably that depends on `metadata_available=True` - confirm.
+            self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
+            return live_status, self._availability()
+
+        # Case 3: not watchable for the reasons listed by the page.
+        rejected_reasons = traverse_obj(embedded_data, ('userProgramWatch', 'rejectedReasons', ..., {str}))
+        self.write_debug(f'.userProgramWatch.rejectedReasons: {rejected_reasons!r}')
+
+        # Only the first matching reason adjusts the live status (and emits its warning).
+        if 'programNotBegun' in rejected_reasons:
+            self.report_warning('Live has not started', video_id)
+            live_status = 'is_upcoming'
+        elif 'timeshiftBeforeOpen' in rejected_reasons:
+            self.report_warning('Live has ended but timeshift is not yet processed', video_id)
+            live_status = 'post_live'
+        elif 'noTimeshiftProgram' in rejected_reasons:
+            self.report_warning('Timeshift is disabled', video_id)
+            live_status = 'was_live'
+        elif any(x in ['timeshiftClosed', 'timeshiftClosedAndNotFollow'] for x in rejected_reasons):
+            self.report_warning('Timeshift viewing period has ended', video_id)
+            live_status = 'was_live'
+
+        # Map the reasons onto availability flags: 'notLogin' -> needs_premium, membership
+        # reasons -> needs_subscription, ticket/serial reasons -> needs_auth.
+        # The two "...AndNotHaveTimeshiftTicket" reasons are listed under both
+        # needs_subscription and needs_auth; which flag wins is decided by _availability().
+        availability = self._availability(needs_premium='notLogin' in rejected_reasons, needs_subscription=any(x in [
+            'notSocialGroupMember',
+            'notCommunityMember',
+            'notChannelMember',
+            'notCommunityMemberAndNotHaveTimeshiftTicket',
+            'notChannelMemberAndNotHaveTimeshiftTicket',
+        ] for x in rejected_reasons), needs_auth=any(x in [
+            'timeshiftTicketExpired',
+            'notHaveTimeshiftTicket',
+            'notCommunityMemberAndNotHaveTimeshiftTicket',
+            'notChannelMemberAndNotHaveTimeshiftTicket',
+            'notHavePayTicket',
+            'notActivatedBySerial',
+            'notHavePayTicketAndNotActivatedBySerial',
+            'notUseTimeshiftTicket',
+            'notUseTimeshiftTicketOnOnceTimeshift',
+            'notUseTimeshiftTicketOnUnlimitedTimeshift',
+        ] for x in rejected_reasons))
+
+        return live_status, availability
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@ -887,7 +887,7 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
 class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
    def _needs_fixup(self, info):
        yield info['ext'] in ('mp4', 'm4a')
-        yield info['protocol'].startswith('m3u8')
+        yield info['protocol'].startswith('m3u8') or info['protocol'] == 'niconico_live'
        try:
            metadata = self.get_metadata_object(info['filepath'])
        except PostProcessingError as e:
Author	SHA1	Message	Date
Mozi	6b09e928bb	Merge `3095d815c9` into `eb15fd5a32`	2024-11-17 21:14:57 +05:30
krichbanana	eb15fd5a32	[ie/kenh14] Add extractor (#3996 ) Closes #3937 Authored by: krichbanana, pzhlkj6612 Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>	2024-11-17 14:12:26 +00:00
sepro	7cecd299e4	[ie/chaturbate] Don't break embed detection (#11565 ) Bugfix for `720b3dc453` Authored by: seproDev	2024-11-17 13:32:12 +01:00
Mozi	3095d815c9	no all() for both things	2024-10-19 18:22:53 +00:00
Mozi	06bd726ab3	Fix formats sorting; simplify m3u8 extraction in downloader; clean code	2024-10-19 17:59:18 +00:00
Mozi	52d9594ea6	merge 'master'	2024-10-19 16:02:08 +00:00
Mozi	e720e8879d	do not rename protocol; add conditions to FFmpegFixupM3u8PP	2024-08-20 16:16:47 +00:00
Mozi	fe592cd6ab	merge 'master'; new code seems to still work!	2024-08-20 15:12:36 +00:00
Mozi	61fd2648d2	merge 'master'	2024-08-20 14:42:35 +00:00
Mozi	feaefd8ec6	pythonic way to get the format_index	2024-08-20 14:41:50 +00:00
Mozi	dcefdfe508	apply future Ruff rules	2024-08-20 14:41:50 +00:00
Mozi	1e23756e50	[ie/niconico] raise_login_required() for all possible availabilities	2024-06-01 02:19:03 +00:00
Mozi	efe4b7101a	[ie/niconico] support login for niconico live	2024-05-31 18:31:49 +00:00
Mozi	365e615d11	merge 'master'	2024-05-31 18:25:30 +00:00
Mozi	f65ad7f3c2	[ie/niconico] adjust the warning about the blank part in videos	2024-05-31 18:16:47 +00:00
Mozi	53a7fcc231	[ie/niconico] accurately check live status; add availability check	2024-05-30 16:19:02 +00:00
Mozi	31c13e92e2	merge 'master'	2024-05-27 14:15:06 +00:00
Mozi	fe29c67a14	use urllib.parse.urlparse()	2024-05-26 16:03:48 +00:00
Mozi	60f51dec60	merge 'master'	2024-05-26 15:56:08 +00:00
Mozi	28c242d82c	null check for "self.ws"	2024-03-12 16:03:20 +00:00
Mozi	d9a6507fe6	[ie/niconico] Support "--load-info-json" by saving WebSocket url aka "--load-info". Don't save a Response object to info JSON. Just create a new WebSocket connection during the download. Due to Niconico's logic, the manifest m3u8 url will be unusable soon if there is no active WebSocket connection, so the reconnection will give us a valid manifest m3u8, unless the WebSocket url has already expired.	2024-03-12 15:58:15 +00:00
Mozi	972a2d51ad	[ie/niconico] Apply suggestions: info_dict, protocols and downloaders - Use "downloader_options" to pass options used by the downloader. - Combine the two downloaders into one. - Don't inherit from "HlsFD". Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>	2024-03-12 15:06:38 +00:00
Mozi	7398a7cb2f	Merge branch 'origin/master'	2024-03-11 16:19:11 +00:00
Mozi	51681d1294	[ie/niconico] Use "network_exceptions"; add errnote for m3u8 download	2024-03-11 16:18:38 +00:00
Mozi	41c6125907	[ie/niconico] Move WebSocket headers to info_dict Thanks for coletdjnz's suggestion!	2024-03-11 15:29:20 +00:00
Mozi	16974726a4	[ie/niconico] Directly download live timeshift videos; WebSocket fixes Major changes: - Make a downloader for live timeshift videos. Time-based download rate limit applies. RetryManager-based error recovery applies. - Fix the incorrect url for WebSocket reconnection. - Correctly close the WebSocket connection. - [!] Apply "FFmpegFixupM3u8PP" for both non-timeshift and timeshift MPEG-TS files by adding "m3u8_*" prefixes and inheriting from "HlsFD". - [!] Change the protocol from "hls+fmp4" to "hls" in "startWatching" WebSocket requests because I didn't see it in my test. Minor changes: - Support metadata extraction when no formats. - Set "live_status" instead of "is_live". - Clean up "info_dict": Change WebSocket configs to private to hide them from users; extract common fields and remove unused ones. - Update a download test.	2024-03-10 07:53:59 +00:00