Compare commits


18 Commits

Author SHA1 Message Date
bashonly
4832d9da61
[ie/duoplay] Remove expired test
Authored by: bashonly
2023-11-20 17:52:30 -06:00
bashonly
1a48231546
Remove duplicate websockets from requirements.txt
Authored by: bashonly
2023-11-20 17:49:26 -06:00
bashonly
a9a93d7f5e
Merge branch 'yt-dlp:master' into cleanup/2023-12
2023-11-20 17:34:03 -06:00
pk
a0b19d319a
[core] Support NO_COLOR environment variable (#8385)
Authored by: prettykool, Grub4K
2023-11-20 23:43:52 +01:00
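For illustration, a minimal standalone sketch of how the 'auto' color policy behaves once NO_COLOR is honored, mirroring the YoutubeDL.py hunk further down in this diff; resolve_color_policy is a hypothetical helper and stream.isatty() stands in for yt-dlp's supports_terminal_sequences():

import os
import sys


def resolve_color_policy(stream, policy='auto'):
    # Dumb terminals never get color; NO_COLOR downgrades 'auto' to 'no_color'.
    term_allow_color = os.getenv('TERM', '').lower() != 'dumb'
    no_color = bool(os.getenv('NO_COLOR'))
    if policy in ('auto', None):
        if term_allow_color and stream.isatty():
            return 'no_color' if no_color else True
        return False
    assert policy in ('always', 'never', 'no_color'), policy
    return {'always': True, 'never': False}.get(policy, policy)


print(resolve_color_policy(sys.stdout))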
middlingphys
cc07f5cc85
[ie/abematv] Fix season metadata (#8607)
Authored by: middlingphys
2023-11-20 22:39:12 +00:00
coletdjnz
ccfd70f4c2
[rh:websockets] Migrate websockets to networking framework (#7720)
* Adds a basic WebSocket framework
* Introduces new minimum `websockets` version of 12.0
* Deprecates `WebSocketsWrapper`

Fixes https://github.com/yt-dlp/yt-dlp/issues/8439

Authored by: coletdjnz
2023-11-20 08:04:04 +00:00
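With this change a ws:// or wss:// URL can be opened through the same networking stack as HTTP requests. A rough usage sketch (the echo-server URL is hypothetical; requires websockets>=12.0), matching how the updated niconico downloader and the new tests use the framework:

from yt_dlp import YoutubeDL
from yt_dlp.networking import Request

with YoutubeDL() as ydl:
    # urlopen() now routes ws/wss schemes to the Websockets request handler
    ws = ydl.urlopen(Request('ws://localhost:8765', headers={'Origin': 'http://localhost'}))
    ws.send('hello')   # str or bytes frames
    print(ws.recv())   # read one message back
    ws.close()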
sepro
45d82be65f
[ie/nebula] Overhaul extractors (#8566)
Closes #4300, Closes #5814, Closes #7588, Closes #6334, Closes #6538
Authored by: elyse0, pukkandan, seproDev

Co-authored-by: Elyse <26639800+elyse0@users.noreply.github.com>
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
2023-11-20 01:03:33 +00:00
Safouane Aarab
3237f8ba29
[ie/allstar] Add extractors (#8274)
Closes #6917
Authored by: S-Aarab
2023-11-20 00:07:19 +00:00
Kyraminol Endyeran
1725e943b0
[ie/vvvvid] Set user-agent to fix extraction (#8615)
Authored by: Kyraminol
2023-11-19 21:30:21 +00:00
c-basalt
9f09bdcfcb
[ie/bilibili] Support courses and interactive videos (#8343)
Closes #6135, Closes #8428
Authored by: c-basalt
2023-11-19 21:26:46 +00:00
Simon Sawicki
f124fa4588
[ci] Concurrency optimizations (#8614)
Authored by: Grub4K
2023-11-19 16:05:13 +01:00
JC-Chung
585d0ed9ab
[ie/twitcasting] Detect livestreams via API and show page (#8601)
Authored by: JC-Chung, bashonly
2023-11-18 22:14:45 +00:00
SirElderling
1fa3f24d4b
[ie/theguardian] Add extractors (#8535)
Closes #8520
Authored by: SirElderling
2023-11-18 21:54:00 +00:00
sepro
ddb2d7588b
[ie] Extract from media elements in SMIL manifests (#8504)
Authored by: seproDev
2023-11-18 21:51:18 +00:00
qbnu
f223b1b078
[ie/vocaroo] Do not use deprecated getheader (#8606)
Authored by: qbnu
2023-11-18 21:49:23 +00:00
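The change here swaps the deprecated getheader() accessor for the response's headers mapping. A hedged sketch of the preferred pattern (the example.com URL is only illustrative):

from yt_dlp import YoutubeDL

with YoutubeDL() as ydl:
    urlh = ydl.urlopen('https://example.com')
    content_type = urlh.headers.get('Content-Type', '')  # preferred
    # content_type = urlh.getheader('Content-Type')      # deprecated accessor
    print(content_type)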
Berkay
6fe82491ed
[ie/twitter:broadcast] Extract concurrent_view_count (#8600)
Authored by: sonmezberkay
2023-11-18 21:46:22 +00:00
sepro
34df1c1f60
[ie/vidly] Add extractor (#8612)
Authored by: seproDev
2023-11-18 20:28:25 +00:00
Simon Sawicki
1d24da6c89
[ie/nintendo] Fix Nintendo Direct extraction (#8609)
Authored by: Grub4K
2023-11-18 21:04:42 +01:00
31 changed files with 2258 additions and 528 deletions

View File

@@ -3,6 +3,10 @@ on: [push, pull_request]
 permissions:
   contents: read
 
+concurrency:
+  group: core-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   tests:
     name: Core Tests

View File

@@ -10,7 +10,6 @@ on:
       - "pyinst.py"
 
 concurrency:
   group: release-master
-  cancel-in-progress: true
 permissions:
   contents: read

View File

@@ -1,8 +1,8 @@
 mutagen
 pycryptodomex
-websockets
 brotli; implementation_name=='cpython'
 brotlicffi; implementation_name!='cpython'
 certifi
 requests>=2.31.0,<3
 urllib3>=1.26.17,<3
+websockets>=12.0
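The websockets requirement moves to the bottom of the list with a new minimum version. A quick local check of the raised floor (assumes nothing beyond the standard library):

from importlib.metadata import PackageNotFoundError, version

try:
    major = int(version('websockets').split('.')[0])
    print('OK' if major >= 12 else 'websockets is older than the required 12.0')
except PackageNotFoundError:
    print('websockets is not installed (optional dependency)')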

View File

@@ -19,3 +19,8 @@ def handler(request):
         pytest.skip(f'{RH_KEY} request handler is not available')
 
     return functools.partial(handler, logger=FakeLogger)
+
+
+def validate_and_send(rh, req):
+    rh.validate(req)
+    return rh.send(req)

View File

@@ -214,8 +214,9 @@ def sanitize_got_info_dict(got_dict):
     test_info_dict = {
         key: sanitize(key, value) for key, value in got_dict.items()
-        if value is not None and key not in IGNORED_FIELDS and not any(
-            key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES)
+        if value is not None and key not in IGNORED_FIELDS and (
+            not any(key.startswith(f'{prefix}_') for prefix in IGNORED_PREFIXES)
+            or key == '_old_archive_ids')
     }
 
     # display_id may be generated from id

View File

@ -52,6 +52,8 @@ from yt_dlp.networking.exceptions import (
from yt_dlp.utils._utils import _YDLLogger as FakeLogger from yt_dlp.utils._utils import _YDLLogger as FakeLogger
from yt_dlp.utils.networking import HTTPHeaderDict from yt_dlp.utils.networking import HTTPHeaderDict
from test.conftest import validate_and_send
TEST_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_DIR = os.path.dirname(os.path.abspath(__file__))
@ -275,11 +277,6 @@ class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode()) self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode())
def validate_and_send(rh, req):
rh.validate(req)
return rh.send(req)
class TestRequestHandlerBase: class TestRequestHandlerBase:
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
@ -872,8 +869,9 @@ class TestRequestsRequestHandler(TestRequestHandlerBase):
]) ])
@pytest.mark.parametrize('handler', ['Requests'], indirect=True) @pytest.mark.parametrize('handler', ['Requests'], indirect=True)
def test_response_error_mapping(self, handler, monkeypatch, raised, expected, match): def test_response_error_mapping(self, handler, monkeypatch, raised, expected, match):
from urllib3.response import HTTPResponse as Urllib3Response
from requests.models import Response as RequestsResponse from requests.models import Response as RequestsResponse
from urllib3.response import HTTPResponse as Urllib3Response
from yt_dlp.networking._requests import RequestsResponseAdapter from yt_dlp.networking._requests import RequestsResponseAdapter
requests_res = RequestsResponse() requests_res = RequestsResponse()
requests_res.raw = Urllib3Response(body=b'', status=200) requests_res.raw = Urllib3Response(body=b'', status=200)
@ -929,13 +927,17 @@ class TestRequestHandlerValidation:
('http', False, {}), ('http', False, {}),
('https', False, {}), ('https', False, {}),
]), ]),
('Websockets', [
('ws', False, {}),
('wss', False, {}),
]),
(NoCheckRH, [('http', False, {})]), (NoCheckRH, [('http', False, {})]),
(ValidationRH, [('http', UnsupportedRequest, {})]) (ValidationRH, [('http', UnsupportedRequest, {})])
] ]
PROXY_SCHEME_TESTS = [ PROXY_SCHEME_TESTS = [
# scheme, expected to fail # scheme, expected to fail
('Urllib', [ ('Urllib', 'http', [
('http', False), ('http', False),
('https', UnsupportedRequest), ('https', UnsupportedRequest),
('socks4', False), ('socks4', False),
@ -944,7 +946,7 @@ class TestRequestHandlerValidation:
('socks5h', False), ('socks5h', False),
('socks', UnsupportedRequest), ('socks', UnsupportedRequest),
]), ]),
('Requests', [ ('Requests', 'http', [
('http', False), ('http', False),
('https', False), ('https', False),
('socks4', False), ('socks4', False),
@ -952,8 +954,11 @@ class TestRequestHandlerValidation:
('socks5', False), ('socks5', False),
('socks5h', False), ('socks5h', False),
]), ]),
(NoCheckRH, [('http', False)]), (NoCheckRH, 'http', [('http', False)]),
(HTTPSupportedRH, [('http', UnsupportedRequest)]), (HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]),
('Websockets', 'ws', [('http', UnsupportedRequest)]),
(NoCheckRH, 'http', [('http', False)]),
(HTTPSupportedRH, 'http', [('http', UnsupportedRequest)]),
] ]
PROXY_KEY_TESTS = [ PROXY_KEY_TESTS = [
@ -972,7 +977,7 @@ class TestRequestHandlerValidation:
] ]
EXTENSION_TESTS = [ EXTENSION_TESTS = [
('Urllib', [ ('Urllib', 'http', [
({'cookiejar': 'notacookiejar'}, AssertionError), ({'cookiejar': 'notacookiejar'}, AssertionError),
({'cookiejar': YoutubeDLCookieJar()}, False), ({'cookiejar': YoutubeDLCookieJar()}, False),
({'cookiejar': CookieJar()}, AssertionError), ({'cookiejar': CookieJar()}, AssertionError),
@ -980,17 +985,21 @@ class TestRequestHandlerValidation:
({'timeout': 'notatimeout'}, AssertionError), ({'timeout': 'notatimeout'}, AssertionError),
({'unsupported': 'value'}, UnsupportedRequest), ({'unsupported': 'value'}, UnsupportedRequest),
]), ]),
('Requests', [ ('Requests', 'http', [
({'cookiejar': 'notacookiejar'}, AssertionError), ({'cookiejar': 'notacookiejar'}, AssertionError),
({'cookiejar': YoutubeDLCookieJar()}, False), ({'cookiejar': YoutubeDLCookieJar()}, False),
({'timeout': 1}, False), ({'timeout': 1}, False),
({'timeout': 'notatimeout'}, AssertionError), ({'timeout': 'notatimeout'}, AssertionError),
({'unsupported': 'value'}, UnsupportedRequest), ({'unsupported': 'value'}, UnsupportedRequest),
]), ]),
(NoCheckRH, [ (NoCheckRH, 'http', [
({'cookiejar': 'notacookiejar'}, False), ({'cookiejar': 'notacookiejar'}, False),
({'somerandom': 'test'}, False), # but any extension is allowed through ({'somerandom': 'test'}, False), # but any extension is allowed through
]), ]),
('Websockets', 'ws', [
({'cookiejar': YoutubeDLCookieJar()}, False),
({'timeout': 2}, False),
]),
] ]
@pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [ @pytest.mark.parametrize('handler,scheme,fail,handler_kwargs', [
@ -1016,14 +1025,14 @@ class TestRequestHandlerValidation:
run_validation(handler, fail, Request('http://', proxies={proxy_key: 'http://example.com'})) run_validation(handler, fail, Request('http://', proxies={proxy_key: 'http://example.com'}))
run_validation(handler, fail, Request('http://'), proxies={proxy_key: 'http://example.com'}) run_validation(handler, fail, Request('http://'), proxies={proxy_key: 'http://example.com'})
@pytest.mark.parametrize('handler,scheme,fail', [ @pytest.mark.parametrize('handler,req_scheme,scheme,fail', [
(handler_tests[0], scheme, fail) (handler_tests[0], handler_tests[1], scheme, fail)
for handler_tests in PROXY_SCHEME_TESTS for handler_tests in PROXY_SCHEME_TESTS
for scheme, fail in handler_tests[1] for scheme, fail in handler_tests[2]
], indirect=['handler']) ], indirect=['handler'])
def test_proxy_scheme(self, handler, scheme, fail): def test_proxy_scheme(self, handler, req_scheme, scheme, fail):
run_validation(handler, fail, Request('http://', proxies={'http': f'{scheme}://example.com'})) run_validation(handler, fail, Request(f'{req_scheme}://', proxies={req_scheme: f'{scheme}://example.com'}))
run_validation(handler, fail, Request('http://'), proxies={'http': f'{scheme}://example.com'}) run_validation(handler, fail, Request(f'{req_scheme}://'), proxies={req_scheme: f'{scheme}://example.com'})
@pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH, 'Requests'], indirect=True) @pytest.mark.parametrize('handler', ['Urllib', HTTPSupportedRH, 'Requests'], indirect=True)
def test_empty_proxy(self, handler): def test_empty_proxy(self, handler):
@ -1035,14 +1044,14 @@ class TestRequestHandlerValidation:
def test_invalid_proxy_url(self, handler, proxy_url): def test_invalid_proxy_url(self, handler, proxy_url):
run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url})) run_validation(handler, UnsupportedRequest, Request('http://', proxies={'http': proxy_url}))
@pytest.mark.parametrize('handler,extensions,fail', [ @pytest.mark.parametrize('handler,scheme,extensions,fail', [
(handler_tests[0], extensions, fail) (handler_tests[0], handler_tests[1], extensions, fail)
for handler_tests in EXTENSION_TESTS for handler_tests in EXTENSION_TESTS
for extensions, fail in handler_tests[1] for extensions, fail in handler_tests[2]
], indirect=['handler']) ], indirect=['handler'])
def test_extension(self, handler, extensions, fail): def test_extension(self, handler, scheme, extensions, fail):
run_validation( run_validation(
handler, fail, Request('http://', extensions=extensions)) handler, fail, Request(f'{scheme}://', extensions=extensions))
def test_invalid_request_type(self): def test_invalid_request_type(self):
rh = self.ValidationRH(logger=FakeLogger()) rh = self.ValidationRH(logger=FakeLogger())
@ -1075,6 +1084,22 @@ class FakeRHYDL(FakeYDL):
self._request_director = self.build_request_director([FakeRH]) self._request_director = self.build_request_director([FakeRH])
class AllUnsupportedRHYDL(FakeYDL):
def __init__(self, *args, **kwargs):
class UnsupportedRH(RequestHandler):
def _send(self, request: Request):
pass
_SUPPORTED_FEATURES = ()
_SUPPORTED_PROXY_SCHEMES = ()
_SUPPORTED_URL_SCHEMES = ()
super().__init__(*args, **kwargs)
self._request_director = self.build_request_director([UnsupportedRH])
class TestRequestDirector: class TestRequestDirector:
def test_handler_operations(self): def test_handler_operations(self):
@ -1234,6 +1259,12 @@ class TestYoutubeDLNetworking:
with pytest.raises(RequestError, match=r'file:// URLs are disabled by default'): with pytest.raises(RequestError, match=r'file:// URLs are disabled by default'):
ydl.urlopen('file://') ydl.urlopen('file://')
@pytest.mark.parametrize('scheme', (['ws', 'wss']))
def test_websocket_unavailable_error(self, scheme):
with AllUnsupportedRHYDL() as ydl:
with pytest.raises(RequestError, match=r'This request requires WebSocket support'):
ydl.urlopen(f'{scheme}://')
def test_legacy_server_connect_error(self): def test_legacy_server_connect_error(self):
with FakeRHYDL() as ydl: with FakeRHYDL() as ydl:
for error in ('UNSAFE_LEGACY_RENEGOTIATION_DISABLED', 'SSLV3_ALERT_HANDSHAKE_FAILURE'): for error in ('UNSAFE_LEGACY_RENEGOTIATION_DISABLED', 'SSLV3_ALERT_HANDSHAKE_FAILURE'):

View File

@ -210,6 +210,16 @@ class SocksHTTPTestRequestHandler(http.server.BaseHTTPRequestHandler, SocksTestR
self.wfile.write(payload.encode()) self.wfile.write(payload.encode())
class SocksWebSocketTestRequestHandler(SocksTestRequestHandler):
def handle(self):
import websockets.sync.server
protocol = websockets.ServerProtocol()
connection = websockets.sync.server.ServerConnection(socket=self.request, protocol=protocol, close_timeout=0)
connection.handshake()
connection.send(json.dumps(self.socks_info))
connection.close()
@contextlib.contextmanager @contextlib.contextmanager
def socks_server(socks_server_class, request_handler, bind_ip=None, **socks_server_kwargs): def socks_server(socks_server_class, request_handler, bind_ip=None, **socks_server_kwargs):
server = server_thread = None server = server_thread = None
@ -252,8 +262,22 @@ class HTTPSocksTestProxyContext(SocksProxyTestContext):
return json.loads(handler.send(request).read().decode()) return json.loads(handler.send(request).read().decode())
class WebSocketSocksTestProxyContext(SocksProxyTestContext):
REQUEST_HANDLER_CLASS = SocksWebSocketTestRequestHandler
def socks_info_request(self, handler, target_domain=None, target_port=None, **req_kwargs):
request = Request(f'ws://{target_domain or "127.0.0.1"}:{target_port or "40000"}', **req_kwargs)
handler.validate(request)
ws = handler.send(request)
ws.send('socks_info')
socks_info = ws.recv()
ws.close()
return json.loads(socks_info)
CTX_MAP = { CTX_MAP = {
'http': HTTPSocksTestProxyContext, 'http': HTTPSocksTestProxyContext,
'ws': WebSocketSocksTestProxyContext,
} }
@ -263,7 +287,7 @@ def ctx(request):
class TestSocks4Proxy: class TestSocks4Proxy:
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks4_no_auth(self, handler, ctx): def test_socks4_no_auth(self, handler, ctx):
with handler() as rh: with handler() as rh:
with ctx.socks_server(Socks4ProxyHandler) as server_address: with ctx.socks_server(Socks4ProxyHandler) as server_address:
@ -271,7 +295,7 @@ class TestSocks4Proxy:
rh, proxies={'all': f'socks4://{server_address}'}) rh, proxies={'all': f'socks4://{server_address}'})
assert response['version'] == 4 assert response['version'] == 4
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks4_auth(self, handler, ctx): def test_socks4_auth(self, handler, ctx):
with handler() as rh: with handler() as rh:
with ctx.socks_server(Socks4ProxyHandler, user_id='user') as server_address: with ctx.socks_server(Socks4ProxyHandler, user_id='user') as server_address:
@ -281,7 +305,7 @@ class TestSocks4Proxy:
rh, proxies={'all': f'socks4://user:@{server_address}'}) rh, proxies={'all': f'socks4://user:@{server_address}'})
assert response['version'] == 4 assert response['version'] == 4
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks4a_ipv4_target(self, handler, ctx): def test_socks4a_ipv4_target(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler) as server_address: with ctx.socks_server(Socks4ProxyHandler) as server_address:
with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh:
@ -289,7 +313,7 @@ class TestSocks4Proxy:
assert response['version'] == 4 assert response['version'] == 4
assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1') assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1')
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks4a_domain_target(self, handler, ctx): def test_socks4a_domain_target(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler) as server_address: with ctx.socks_server(Socks4ProxyHandler) as server_address:
with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh:
@ -298,7 +322,7 @@ class TestSocks4Proxy:
assert response['ipv4_address'] is None assert response['ipv4_address'] is None
assert response['domain_address'] == 'localhost' assert response['domain_address'] == 'localhost'
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_ipv4_client_source_address(self, handler, ctx): def test_ipv4_client_source_address(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler) as server_address: with ctx.socks_server(Socks4ProxyHandler) as server_address:
source_address = f'127.0.0.{random.randint(5, 255)}' source_address = f'127.0.0.{random.randint(5, 255)}'
@ -308,7 +332,7 @@ class TestSocks4Proxy:
assert response['client_address'][0] == source_address assert response['client_address'][0] == source_address
assert response['version'] == 4 assert response['version'] == 4
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
@pytest.mark.parametrize('reply_code', [ @pytest.mark.parametrize('reply_code', [
Socks4CD.REQUEST_REJECTED_OR_FAILED, Socks4CD.REQUEST_REJECTED_OR_FAILED,
Socks4CD.REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD, Socks4CD.REQUEST_REJECTED_CANNOT_CONNECT_TO_IDENTD,
@ -320,7 +344,7 @@ class TestSocks4Proxy:
with pytest.raises(ProxyError): with pytest.raises(ProxyError):
ctx.socks_info_request(rh) ctx.socks_info_request(rh)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_ipv6_socks4_proxy(self, handler, ctx): def test_ipv6_socks4_proxy(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address: with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address:
with handler(proxies={'all': f'socks4://{server_address}'}) as rh: with handler(proxies={'all': f'socks4://{server_address}'}) as rh:
@ -329,7 +353,7 @@ class TestSocks4Proxy:
assert response['ipv4_address'] == '127.0.0.1' assert response['ipv4_address'] == '127.0.0.1'
assert response['version'] == 4 assert response['version'] == 4
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_timeout(self, handler, ctx): def test_timeout(self, handler, ctx):
with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address: with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address:
with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh: with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh:
@ -339,7 +363,7 @@ class TestSocks4Proxy:
class TestSocks5Proxy: class TestSocks5Proxy:
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5_no_auth(self, handler, ctx): def test_socks5_no_auth(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh: with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
@ -347,7 +371,7 @@ class TestSocks5Proxy:
assert response['auth_methods'] == [0x0] assert response['auth_methods'] == [0x0]
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5_user_pass(self, handler, ctx): def test_socks5_user_pass(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler, auth=('test', 'testpass')) as server_address: with ctx.socks_server(Socks5ProxyHandler, auth=('test', 'testpass')) as server_address:
with handler() as rh: with handler() as rh:
@ -360,7 +384,7 @@ class TestSocks5Proxy:
assert response['auth_methods'] == [Socks5Auth.AUTH_NONE, Socks5Auth.AUTH_USER_PASS] assert response['auth_methods'] == [Socks5Auth.AUTH_NONE, Socks5Auth.AUTH_USER_PASS]
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5_ipv4_target(self, handler, ctx): def test_socks5_ipv4_target(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh: with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
@ -368,7 +392,7 @@ class TestSocks5Proxy:
assert response['ipv4_address'] == '127.0.0.1' assert response['ipv4_address'] == '127.0.0.1'
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5_domain_target(self, handler, ctx): def test_socks5_domain_target(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh: with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
@ -376,7 +400,7 @@ class TestSocks5Proxy:
assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1') assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1')
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5h_domain_target(self, handler, ctx): def test_socks5h_domain_target(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: with handler(proxies={'all': f'socks5h://{server_address}'}) as rh:
@ -385,7 +409,7 @@ class TestSocks5Proxy:
assert response['domain_address'] == 'localhost' assert response['domain_address'] == 'localhost'
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5h_ip_target(self, handler, ctx): def test_socks5h_ip_target(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5h://{server_address}'}) as rh: with handler(proxies={'all': f'socks5h://{server_address}'}) as rh:
@ -394,7 +418,7 @@ class TestSocks5Proxy:
assert response['domain_address'] is None assert response['domain_address'] is None
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_socks5_ipv6_destination(self, handler, ctx): def test_socks5_ipv6_destination(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh: with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
@ -402,7 +426,7 @@ class TestSocks5Proxy:
assert response['ipv6_address'] == '::1' assert response['ipv6_address'] == '::1'
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_ipv6_socks5_proxy(self, handler, ctx): def test_ipv6_socks5_proxy(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address: with ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}) as rh: with handler(proxies={'all': f'socks5://{server_address}'}) as rh:
@ -413,7 +437,7 @@ class TestSocks5Proxy:
# XXX: is there any feasible way of testing IPv6 source addresses? # XXX: is there any feasible way of testing IPv6 source addresses?
# Same would go for non-proxy source_address test... # Same would go for non-proxy source_address test...
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
def test_ipv4_client_source_address(self, handler, ctx): def test_ipv4_client_source_address(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler) as server_address: with ctx.socks_server(Socks5ProxyHandler) as server_address:
source_address = f'127.0.0.{random.randint(5, 255)}' source_address = f'127.0.0.{random.randint(5, 255)}'
@ -422,7 +446,7 @@ class TestSocks5Proxy:
assert response['client_address'][0] == source_address assert response['client_address'][0] == source_address
assert response['version'] == 5 assert response['version'] == 5
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Requests', 'http'), ('Websockets', 'ws')], indirect=True)
@pytest.mark.parametrize('reply_code', [ @pytest.mark.parametrize('reply_code', [
Socks5Reply.GENERAL_FAILURE, Socks5Reply.GENERAL_FAILURE,
Socks5Reply.CONNECTION_NOT_ALLOWED, Socks5Reply.CONNECTION_NOT_ALLOWED,
@ -439,7 +463,7 @@ class TestSocks5Proxy:
with pytest.raises(ProxyError): with pytest.raises(ProxyError):
ctx.socks_info_request(rh) ctx.socks_info_request(rh)
@pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http'), ('Websockets', 'ws')], indirect=True)
def test_timeout(self, handler, ctx): def test_timeout(self, handler, ctx):
with ctx.socks_server(Socks5ProxyHandler, sleep=2) as server_address: with ctx.socks_server(Socks5ProxyHandler, sleep=2) as server_address:
with handler(proxies={'all': f'socks5://{server_address}'}, timeout=1) as rh: with handler(proxies={'all': f'socks5://{server_address}'}, timeout=1) as rh:
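These parametrizations now exercise the Websockets handler against the same SOCKS test servers as the HTTP handlers. A hedged sketch of the equivalent end-user configuration (proxy address and target URL are hypothetical):

from yt_dlp import YoutubeDL
from yt_dlp.networking import Request

with YoutubeDL({'proxy': 'socks5://127.0.0.1:1080'}) as ydl:
    ws = ydl.urlopen(Request('ws://localhost:8765'))
    ws.send('ping')
    print(ws.recv())
    ws.close()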

test/test_websockets.py (new file, 380 lines)
View File

@ -0,0 +1,380 @@
#!/usr/bin/env python3
# Allow direct execution
import os
import sys
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import http.client
import http.cookiejar
import http.server
import json
import random
import ssl
import threading
from yt_dlp import socks
from yt_dlp.cookies import YoutubeDLCookieJar
from yt_dlp.dependencies import websockets
from yt_dlp.networking import Request
from yt_dlp.networking.exceptions import (
CertificateVerifyError,
HTTPError,
ProxyError,
RequestError,
SSLError,
TransportError,
)
from yt_dlp.utils.networking import HTTPHeaderDict
from test.conftest import validate_and_send
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
def websocket_handler(websocket):
for message in websocket:
if isinstance(message, bytes):
if message == b'bytes':
return websocket.send('2')
elif isinstance(message, str):
if message == 'headers':
return websocket.send(json.dumps(dict(websocket.request.headers)))
elif message == 'path':
return websocket.send(websocket.request.path)
elif message == 'source_address':
return websocket.send(websocket.remote_address[0])
elif message == 'str':
return websocket.send('1')
return websocket.send(message)
def process_request(self, request):
if request.path.startswith('/gen_'):
status = http.HTTPStatus(int(request.path[5:]))
if 300 <= status.value <= 300:
return websockets.http11.Response(
status.value, status.phrase, websockets.datastructures.Headers([('Location', '/')]), b'')
return self.protocol.reject(status.value, status.phrase)
return self.protocol.accept(request)
def create_websocket_server(**ws_kwargs):
import websockets.sync.server
wsd = websockets.sync.server.serve(websocket_handler, '127.0.0.1', 0, process_request=process_request, **ws_kwargs)
ws_port = wsd.socket.getsockname()[1]
ws_server_thread = threading.Thread(target=wsd.serve_forever)
ws_server_thread.daemon = True
ws_server_thread.start()
return ws_server_thread, ws_port
def create_ws_websocket_server():
return create_websocket_server()
def create_wss_websocket_server():
certfn = os.path.join(TEST_DIR, 'testcert.pem')
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.load_cert_chain(certfn, None)
return create_websocket_server(ssl_context=sslctx)
MTLS_CERT_DIR = os.path.join(TEST_DIR, 'testdata', 'certificate')
def create_mtls_wss_websocket_server():
certfn = os.path.join(TEST_DIR, 'testcert.pem')
cacertfn = os.path.join(MTLS_CERT_DIR, 'ca.crt')
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.verify_mode = ssl.CERT_REQUIRED
sslctx.load_verify_locations(cafile=cacertfn)
sslctx.load_cert_chain(certfn, None)
return create_websocket_server(ssl_context=sslctx)
@pytest.mark.skipif(not websockets, reason='websockets must be installed to test websocket request handlers')
class TestWebsSocketRequestHandlerConformance:
@classmethod
def setup_class(cls):
cls.ws_thread, cls.ws_port = create_ws_websocket_server()
cls.ws_base_url = f'ws://127.0.0.1:{cls.ws_port}'
cls.wss_thread, cls.wss_port = create_wss_websocket_server()
cls.wss_base_url = f'wss://127.0.0.1:{cls.wss_port}'
cls.bad_wss_thread, cls.bad_wss_port = create_websocket_server(ssl_context=ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER))
cls.bad_wss_host = f'wss://127.0.0.1:{cls.bad_wss_port}'
cls.mtls_wss_thread, cls.mtls_wss_port = create_mtls_wss_websocket_server()
cls.mtls_wss_base_url = f'wss://127.0.0.1:{cls.mtls_wss_port}'
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_basic_websockets(self, handler):
with handler() as rh:
ws = validate_and_send(rh, Request(self.ws_base_url))
assert 'upgrade' in ws.headers
assert ws.status == 101
ws.send('foo')
assert ws.recv() == 'foo'
ws.close()
# https://www.rfc-editor.org/rfc/rfc6455.html#section-5.6
@pytest.mark.parametrize('msg,opcode', [('str', 1), (b'bytes', 2)])
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_send_types(self, handler, msg, opcode):
with handler() as rh:
ws = validate_and_send(rh, Request(self.ws_base_url))
ws.send(msg)
assert int(ws.recv()) == opcode
ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_verify_cert(self, handler):
with handler() as rh:
with pytest.raises(CertificateVerifyError):
validate_and_send(rh, Request(self.wss_base_url))
with handler(verify=False) as rh:
ws = validate_and_send(rh, Request(self.wss_base_url))
assert ws.status == 101
ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_ssl_error(self, handler):
with handler(verify=False) as rh:
with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info:
validate_and_send(rh, Request(self.bad_wss_host))
assert not issubclass(exc_info.type, CertificateVerifyError)
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('path,expected', [
# Unicode characters should be encoded with uppercase percent-encoding
('/中文', '/%E4%B8%AD%E6%96%87'),
# don't normalize existing percent encodings
('/%c7%9f', '/%c7%9f'),
])
def test_percent_encode(self, handler, path, expected):
with handler() as rh:
ws = validate_and_send(rh, Request(f'{self.ws_base_url}{path}'))
ws.send('path')
assert ws.recv() == expected
assert ws.status == 101
ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_remove_dot_segments(self, handler):
with handler() as rh:
# This isn't a comprehensive test,
# but it should be enough to check whether the handler is removing dot segments
ws = validate_and_send(rh, Request(f'{self.ws_base_url}/a/b/./../../test'))
assert ws.status == 101
ws.send('path')
assert ws.recv() == '/test'
ws.close()
# We are restricted to known HTTP status codes in http.HTTPStatus
# Redirects are not supported for websockets
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('status', (200, 204, 301, 302, 303, 400, 500, 511))
def test_raise_http_error(self, handler, status):
with handler() as rh:
with pytest.raises(HTTPError) as exc_info:
validate_and_send(rh, Request(f'{self.ws_base_url}/gen_{status}'))
assert exc_info.value.status == status
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
@pytest.mark.parametrize('params,extensions', [
({'timeout': 0.00001}, {}),
({}, {'timeout': 0.00001}),
])
def test_timeout(self, handler, params, extensions):
with handler(**params) as rh:
with pytest.raises(TransportError):
validate_and_send(rh, Request(self.ws_base_url, extensions=extensions))
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_cookies(self, handler):
cookiejar = YoutubeDLCookieJar()
cookiejar.set_cookie(http.cookiejar.Cookie(
version=0, name='test', value='ytdlp', port=None, port_specified=False,
domain='127.0.0.1', domain_specified=True, domain_initial_dot=False, path='/',
path_specified=True, secure=False, expires=None, discard=False, comment=None,
comment_url=None, rest={}))
with handler(cookiejar=cookiejar) as rh:
ws = validate_and_send(rh, Request(self.ws_base_url))
ws.send('headers')
assert json.loads(ws.recv())['cookie'] == 'test=ytdlp'
ws.close()
with handler() as rh:
ws = validate_and_send(rh, Request(self.ws_base_url))
ws.send('headers')
assert 'cookie' not in json.loads(ws.recv())
ws.close()
ws = validate_and_send(rh, Request(self.ws_base_url, extensions={'cookiejar': cookiejar}))
ws.send('headers')
assert json.loads(ws.recv())['cookie'] == 'test=ytdlp'
ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_source_address(self, handler):
source_address = f'127.0.0.{random.randint(5, 255)}'
with handler(source_address=source_address) as rh:
ws = validate_and_send(rh, Request(self.ws_base_url))
ws.send('source_address')
assert source_address == ws.recv()
ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_response_url(self, handler):
with handler() as rh:
url = f'{self.ws_base_url}/something'
ws = validate_and_send(rh, Request(url))
assert ws.url == url
ws.close()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_request_headers(self, handler):
with handler(headers=HTTPHeaderDict({'test1': 'test', 'test2': 'test2'})) as rh:
# Global Headers
ws = validate_and_send(rh, Request(self.ws_base_url))
ws.send('headers')
headers = HTTPHeaderDict(json.loads(ws.recv()))
assert headers['test1'] == 'test'
ws.close()
# Per request headers, merged with global
ws = validate_and_send(rh, Request(
self.ws_base_url, headers={'test2': 'changed', 'test3': 'test3'}))
ws.send('headers')
headers = HTTPHeaderDict(json.loads(ws.recv()))
assert headers['test1'] == 'test'
assert headers['test2'] == 'changed'
assert headers['test3'] == 'test3'
ws.close()
@pytest.mark.parametrize('client_cert', (
{'client_certificate': os.path.join(MTLS_CERT_DIR, 'clientwithkey.crt')},
{
'client_certificate': os.path.join(MTLS_CERT_DIR, 'client.crt'),
'client_certificate_key': os.path.join(MTLS_CERT_DIR, 'client.key'),
},
{
'client_certificate': os.path.join(MTLS_CERT_DIR, 'clientwithencryptedkey.crt'),
'client_certificate_password': 'foobar',
},
{
'client_certificate': os.path.join(MTLS_CERT_DIR, 'client.crt'),
'client_certificate_key': os.path.join(MTLS_CERT_DIR, 'clientencrypted.key'),
'client_certificate_password': 'foobar',
}
))
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_mtls(self, handler, client_cert):
with handler(
# Disable client-side validation of unacceptable self-signed testcert.pem
# The test is of a check on the server side, so unaffected
verify=False,
client_cert=client_cert
) as rh:
validate_and_send(rh, Request(self.mtls_wss_base_url)).close()
def create_fake_ws_connection(raised):
import websockets.sync.client
class FakeWsConnection(websockets.sync.client.ClientConnection):
def __init__(self, *args, **kwargs):
class FakeResponse:
body = b''
headers = {}
status_code = 101
reason_phrase = 'test'
self.response = FakeResponse()
def send(self, *args, **kwargs):
raise raised()
def recv(self, *args, **kwargs):
raise raised()
def close(self, *args, **kwargs):
return
return FakeWsConnection()
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
class TestWebsocketsRequestHandler:
@pytest.mark.parametrize('raised,expected', [
# https://websockets.readthedocs.io/en/stable/reference/exceptions.html
(lambda: websockets.exceptions.InvalidURI(msg='test', uri='test://'), RequestError),
# Requires a response object. Should be covered by HTTP error tests.
# (lambda: websockets.exceptions.InvalidStatus(), TransportError),
(lambda: websockets.exceptions.InvalidHandshake(), TransportError),
# These are subclasses of InvalidHandshake
(lambda: websockets.exceptions.InvalidHeader(name='test'), TransportError),
(lambda: websockets.exceptions.NegotiationError(), TransportError),
# Catch-all
(lambda: websockets.exceptions.WebSocketException(), TransportError),
(lambda: TimeoutError(), TransportError),
# These may be raised by our create_connection implementation, which should also be caught
(lambda: OSError(), TransportError),
(lambda: ssl.SSLError(), SSLError),
(lambda: ssl.SSLCertVerificationError(), CertificateVerifyError),
(lambda: socks.ProxyError(), ProxyError),
])
def test_request_error_mapping(self, handler, monkeypatch, raised, expected):
import websockets.sync.client
import yt_dlp.networking._websockets
with handler() as rh:
def fake_connect(*args, **kwargs):
raise raised()
monkeypatch.setattr(yt_dlp.networking._websockets, 'create_connection', lambda *args, **kwargs: None)
monkeypatch.setattr(websockets.sync.client, 'connect', fake_connect)
with pytest.raises(expected) as exc_info:
rh.send(Request('ws://fake-url'))
assert exc_info.type is expected
@pytest.mark.parametrize('raised,expected,match', [
# https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.send
(lambda: websockets.exceptions.ConnectionClosed(None, None), TransportError, None),
(lambda: RuntimeError(), TransportError, None),
(lambda: TimeoutError(), TransportError, None),
(lambda: TypeError(), RequestError, None),
(lambda: socks.ProxyError(), ProxyError, None),
# Catch-all
(lambda: websockets.exceptions.WebSocketException(), TransportError, None),
])
def test_ws_send_error_mapping(self, handler, monkeypatch, raised, expected, match):
from yt_dlp.networking._websockets import WebsocketsResponseAdapter
ws = WebsocketsResponseAdapter(create_fake_ws_connection(raised), url='ws://fake-url')
with pytest.raises(expected, match=match) as exc_info:
ws.send('test')
assert exc_info.type is expected
@pytest.mark.parametrize('raised,expected,match', [
# https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv
(lambda: websockets.exceptions.ConnectionClosed(None, None), TransportError, None),
(lambda: RuntimeError(), TransportError, None),
(lambda: TimeoutError(), TransportError, None),
(lambda: socks.ProxyError(), ProxyError, None),
# Catch-all
(lambda: websockets.exceptions.WebSocketException(), TransportError, None),
])
def test_ws_recv_error_mapping(self, handler, monkeypatch, raised, expected, match):
from yt_dlp.networking._websockets import WebsocketsResponseAdapter
ws = WebsocketsResponseAdapter(create_fake_ws_connection(raised), url='ws://fake-url')
with pytest.raises(expected, match=match) as exc_info:
ws.recv()
assert exc_info.type is expected

View File

@@ -625,13 +625,16 @@ class YoutubeDL:
                     'Overwriting params from "color" with "no_color"')
             self.params['color'] = 'no_color'
 
-        term_allow_color = os.environ.get('TERM', '').lower() != 'dumb'
+        term_allow_color = os.getenv('TERM', '').lower() != 'dumb'
+        no_color = bool(os.getenv('NO_COLOR'))
 
         def process_color_policy(stream):
             stream_name = {sys.stdout: 'stdout', sys.stderr: 'stderr'}[stream]
             policy = traverse_obj(self.params, ('color', (stream_name, None), {str}), get_all=False)
             if policy in ('auto', None):
-                return term_allow_color and supports_terminal_sequences(stream)
+                if term_allow_color and supports_terminal_sequences(stream):
+                    return 'no_color' if no_color else True
+                return False
             assert policy in ('always', 'never', 'no_color'), policy
             return {'always': True, 'never': False}.get(policy, policy)
@@ -4052,6 +4055,7 @@
             return self._request_director.send(req)
         except NoSupportingHandlers as e:
             for ue in e.unsupported_errors:
+                # FIXME: This depends on the order of errors.
                 if not (ue.handler and ue.msg):
                     continue
                 if ue.handler.RH_KEY == 'Urllib' and 'unsupported url scheme: "file"' in ue.msg.lower():
@@ -4061,6 +4065,15 @@
                 if 'unsupported proxy type: "https"' in ue.msg.lower():
                     raise RequestError(
                         'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests')
+                elif (
+                    re.match(r'unsupported url scheme: "wss?"', ue.msg.lower())
+                    and 'websockets' not in self._request_director.handlers
+                ):
+                    raise RequestError(
+                        'This request requires WebSocket support. '
+                        'Ensure one of the following dependencies are installed: websockets',
+                        cause=ue) from ue
             raise
         except SSLError as e:
             if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e):

View File

@@ -6,7 +6,7 @@ from . import get_suitable_downloader
 from .common import FileDownloader
 from .external import FFmpegFD
 from ..networking import Request
-from ..utils import DownloadError, WebSocketsWrapper, str_or_none, try_get
+from ..utils import DownloadError, str_or_none, try_get
 
 
 class NiconicoDmcFD(FileDownloader):
@@ -64,7 +64,6 @@ class NiconicoLiveFD(FileDownloader):
         ws_url = info_dict['url']
         ws_extractor = info_dict['ws']
         ws_origin_host = info_dict['origin']
-        cookies = info_dict.get('cookies')
         live_quality = info_dict.get('live_quality', 'high')
         live_latency = info_dict.get('live_latency', 'high')
         dl = FFmpegFD(self.ydl, self.params or {})
@@ -76,12 +75,7 @@ class NiconicoLiveFD(FileDownloader):
         def communicate_ws(reconnect):
             if reconnect:
-                ws = WebSocketsWrapper(ws_url, {
-                    'Cookies': str_or_none(cookies) or '',
-                    'Origin': f'https://{ws_origin_host}',
-                    'Accept': '*/*',
-                    'User-Agent': self.params['http_headers']['User-Agent'],
-                })
+                ws = self.ydl.urlopen(Request(ws_url, headers={'Origin': f'https://{ws_origin_host}'}))
                 if self.ydl.params.get('verbose', False):
                     self.to_screen('[debug] Sending startWatching request')
                 ws.send(json.dumps({

View File

@@ -81,16 +81,20 @@ from .airmozilla import AirMozillaIE
 from .airtv import AirTVIE
 from .aitube import AitubeKZVideoIE
 from .aljazeera import AlJazeeraIE
+from .allstar import (
+    AllstarIE,
+    AllstarProfileIE,
+)
 from .alphaporno import AlphaPornoIE
 from .altcensored import (
     AltCensoredIE,
     AltCensoredChannelIE,
 )
-from .amara import AmaraIE
 from .alura import (
     AluraIE,
     AluraCourseIE
 )
+from .amara import AmaraIE
 from .amcnetworks import AMCNetworksIE
 from .amazon import (
     AmazonStoreIE,
@@ -216,6 +220,8 @@ from .bilibili import (
     BiliBiliBangumiIE,
     BiliBiliBangumiSeasonIE,
     BiliBiliBangumiMediaIE,
+    BilibiliCheeseIE,
+    BilibiliCheeseSeasonIE,
     BiliBiliSearchIE,
     BilibiliCategoryIE,
     BilibiliAudioIE,
@@ -1241,6 +1247,7 @@ from .ndr import (
 from .ndtv import NDTVIE
 from .nebula import (
     NebulaIE,
+    NebulaClassIE,
     NebulaSubscriptionsIE,
     NebulaChannelIE,
 )
@@ -2029,6 +2036,10 @@ from .tenplay import (
 from .testurl import TestURLIE
 from .tf1 import TF1IE
 from .tfo import TFOIE
+from .theguardian import (
+    TheGuardianPodcastIE,
+    TheGuardianPodcastPlaylistIE,
+)
 from .theholetv import TheHoleTvIE
 from .theintercept import TheInterceptIE
 from .theplatform import (
@@ -2301,6 +2312,7 @@ from .vidio import (
     VidioLiveIE
 )
 from .vidlii import VidLiiIE
+from .vidly import VidlyIE
 from .viewlift import (
     ViewLiftIE,
     ViewLiftEmbedIE,

View File

@@ -211,7 +211,8 @@ class AbemaTVIE(AbemaTVBaseIE):
             'id': '194-25_s2_p1',
             'title': '第1話 「チーズケーキ」 「モーニング再び」',
             'series': '異世界食堂2',
-            'series_number': 2,
+            'season': 'シーズン2',
+            'season_number': 2,
             'episode': '第1話 「チーズケーキ」 「モーニング再び」',
             'episode_number': 1,
         },
@@ -347,12 +348,12 @@ class AbemaTVIE(AbemaTVBaseIE):
                 )?
             ''', r'\1', og_desc)
 
-        # canonical URL may contain series and episode number
+        # canonical URL may contain season and episode number
         mobj = re.search(r's(\d+)_p(\d+)$', canonical_url)
         if mobj:
             seri = int_or_none(mobj.group(1), default=float('inf'))
             epis = int_or_none(mobj.group(2), default=float('inf'))
-            info['series_number'] = seri if seri < 100 else None
+            info['season_number'] = seri if seri < 100 else None
             # some anime like Detective Conan (though not available in AbemaTV)
             # has more than 1000 episodes (1026 as of 2021/11/15)
             info['episode_number'] = epis if epis < 2000 else None
@@ -381,7 +382,7 @@ class AbemaTVIE(AbemaTVBaseIE):
             self.report_warning('This is a premium-only stream')
         info.update(traverse_obj(api_response, {
             'series': ('series', 'title'),
-            'season': ('season', 'title'),
+            'season': ('season', 'name'),
             'season_number': ('season', 'sequence'),
             'episode_number': ('episode', 'number'),
         }))

yt_dlp/extractor/allstar.py (new file, 253 lines)
View File

@ -0,0 +1,253 @@
import functools
import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
int_or_none,
join_nonempty,
parse_qs,
urljoin,
)
from ..utils.traversal import traverse_obj
_FIELDS = '''
_id
clipImageSource
clipImageThumb
clipLink
clipTitle
createdDate
shareId
user { _id }
username
views'''
_EXTRA_FIELDS = '''
clipLength
clipSizeBytes'''
_QUERIES = {
'clip': '''query ($id: String!) {
video: getClip(clipIdentifier: $id) {
%s %s
}
}''' % (_FIELDS, _EXTRA_FIELDS),
'montage': '''query ($id: String!) {
video: getMontage(clipIdentifier: $id) {
%s
}
}''' % _FIELDS,
'Clips': '''query ($page: Int!, $user: String!, $game: Int) {
videos: clips(search: createdDate, page: $page, user: $user, mobile: false, game: $game) {
data { %s %s }
}
}''' % (_FIELDS, _EXTRA_FIELDS),
'Montages': '''query ($page: Int!, $user: String!) {
videos: montages(search: createdDate, page: $page, user: $user) {
data { %s }
}
}''' % _FIELDS,
'Mobile Clips': '''query ($page: Int!, $user: String!) {
videos: clips(search: createdDate, page: $page, user: $user, mobile: true) {
data { %s %s }
}
}''' % (_FIELDS, _EXTRA_FIELDS),
}
class AllstarBaseIE(InfoExtractor):
@staticmethod
def _parse_video_data(video_data):
def media_url_or_none(path):
return urljoin('https://media.allstar.gg/', path)
info = traverse_obj(video_data, {
'id': ('_id', {str}),
'display_id': ('shareId', {str}),
'title': ('clipTitle', {str}),
'url': ('clipLink', {media_url_or_none}),
'thumbnails': (('clipImageThumb', 'clipImageSource'), {'url': {media_url_or_none}}),
'duration': ('clipLength', {int_or_none}),
'filesize': ('clipSizeBytes', {int_or_none}),
'timestamp': ('createdDate', {functools.partial(int_or_none, scale=1000)}),
'uploader': ('username', {str}),
'uploader_id': ('user', '_id', {str}),
'view_count': ('views', {int_or_none}),
})
if info.get('id') and info.get('url'):
basename = 'clip' if '/clips/' in info['url'] else 'montage'
info['webpage_url'] = f'https://allstar.gg/{basename}?{basename}={info["id"]}'
info.update({
'extractor_key': AllstarIE.ie_key(),
'extractor': AllstarIE.IE_NAME,
'uploader_url': urljoin('https://allstar.gg/u/', info.get('uploader_id')),
})
return info
def _call_api(self, query, variables, path, video_id=None, note=None):
response = self._download_json(
'https://a1.allstar.gg/graphql', video_id, note=note,
headers={'content-type': 'application/json'},
data=json.dumps({'variables': variables, 'query': query}).encode())
errors = traverse_obj(response, ('errors', ..., 'message', {str}))
if errors:
raise ExtractorError('; '.join(errors))
return traverse_obj(response, path)
class AllstarIE(AllstarBaseIE):
_VALID_URL = r'https?://(?:www\.)?allstar\.gg/(?P<type>(?:clip|montage))\?(?P=type)=(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://allstar.gg/clip?clip=64482c2da9eec30008a67d1b',
'info_dict': {
'id': '64482c2da9eec30008a67d1b',
'title': '4K on Inferno',
'url': 'md5:66befb5381eef0c9456026386c25fa55',
'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$',
'uploader': 'chrk.',
'ext': 'mp4',
'duration': 20,
'filesize': 21199257,
'timestamp': 1682451501,
'uploader_id': '62b8bdfc9021052f7905882d',
'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d',
'upload_date': '20230425',
'view_count': int,
}
}, {
'url': 'https://allstar.gg/clip?clip=8LJLY4JKB',
'info_dict': {
'id': '64a1ec6b887f4c0008dc50b8',
'display_id': '8LJLY4JKB',
'title': 'AK-47 3K on Mirage',
'url': 'md5:dde224fd12f035c0e2529a4ae34c4283',
'ext': 'mp4',
'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$',
'duration': 16,
'filesize': 30175859,
'timestamp': 1688333419,
'uploader': 'cherokee',
'uploader_id': '62b8bdfc9021052f7905882d',
'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d',
'upload_date': '20230702',
'view_count': int,
}
}, {
'url': 'https://allstar.gg/montage?montage=643e64089da7e9363e1fa66c',
'info_dict': {
'id': '643e64089da7e9363e1fa66c',
'display_id': 'APQLGM2IMXW',
'title': 'cherokee Rapid Fire Snipers Montage',
'url': 'md5:a3ee356022115db2b27c81321d195945',
'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$',
'ext': 'mp4',
'timestamp': 1681810448,
'uploader': 'cherokee',
'uploader_id': '62b8bdfc9021052f7905882d',
'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d',
'upload_date': '20230418',
'view_count': int,
}
}, {
'url': 'https://allstar.gg/montage?montage=RILJMH6QOS',
'info_dict': {
'id': '64a2697372ce3703de29e868',
'display_id': 'RILJMH6QOS',
'title': 'cherokee Rapid Fire Snipers Montage',
'url': 'md5:d5672e6f88579730c2310a80fdbc4030',
'thumbnail': r're:https://media\.allstar\.gg/.+\.(?:png|jpg)$',
'ext': 'mp4',
'timestamp': 1688365434,
'uploader': 'cherokee',
'uploader_id': '62b8bdfc9021052f7905882d',
'uploader_url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d',
'upload_date': '20230703',
'view_count': int,
}
}]
def _real_extract(self, url):
query_id, video_id = self._match_valid_url(url).group('type', 'id')
return self._parse_video_data(
self._call_api(
_QUERIES.get(query_id), {'id': video_id}, ('data', 'video'), video_id))
class AllstarProfileIE(AllstarBaseIE):
_VALID_URL = r'https?://(?:www\.)?allstar\.gg/(?:profile\?user=|u/)(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://allstar.gg/profile?user=62b8bdfc9021052f7905882d',
'info_dict': {
'id': '62b8bdfc9021052f7905882d-clips',
'title': 'cherokee - Clips',
},
'playlist_mincount': 15
}, {
'url': 'https://allstar.gg/u/cherokee?game=730&view=Clips',
'info_dict': {
'id': '62b8bdfc9021052f7905882d-clips-730',
'title': 'cherokee - Clips - 730',
},
'playlist_mincount': 15
}, {
'url': 'https://allstar.gg/u/62b8bdfc9021052f7905882d?view=Montages',
'info_dict': {
'id': '62b8bdfc9021052f7905882d-montages',
'title': 'cherokee - Montages',
},
'playlist_mincount': 4
}, {
'url': 'https://allstar.gg/profile?user=cherokee&view=Mobile Clips',
'info_dict': {
'id': '62b8bdfc9021052f7905882d-mobile',
'title': 'cherokee - Mobile Clips',
},
'playlist_mincount': 1
}]
_PAGE_SIZE = 10
def _get_page(self, user_id, display_id, game, query, page_num):
page_num += 1
for video_data in self._call_api(
query, {
'user': user_id,
'page': page_num,
'game': game,
}, ('data', 'videos', 'data'), display_id, f'Downloading page {page_num}'):
yield self._parse_video_data(video_data)
def _real_extract(self, url):
display_id = self._match_id(url)
profile_data = self._download_json(
urljoin('https://api.allstar.gg/v1/users/profile/', display_id), display_id)
user_id = traverse_obj(profile_data, ('data', ('_id'), {str}))
if not user_id:
raise ExtractorError('Unable to extract the user id')
username = traverse_obj(profile_data, ('data', 'profile', ('username'), {str}))
url_query = parse_qs(url)
game = traverse_obj(url_query, ('game', 0, {int_or_none}))
query_id = traverse_obj(url_query, ('view', 0), default='Clips')
if query_id not in ('Clips', 'Montages', 'Mobile Clips'):
raise ExtractorError(f'Unsupported playlist URL type {query_id!r}')
return self.playlist_result(
OnDemandPagedList(
functools.partial(
self._get_page, user_id, display_id, game, _QUERIES.get(query_id)), self._PAGE_SIZE),
playlist_id=join_nonempty(user_id, query_id.lower().split()[0], game),
playlist_title=join_nonempty((username or display_id), query_id, game, delim=' - '))
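# For reference, the GraphQL call these extractors build boils down to a plain JSON POST.
# A minimal standalone sketch (assuming the third-party `requests` library; the query
# document below is a hypothetical placeholder -- the real ones live in _QUERIES):
#
#   import json
#   import requests
#
#   query = 'query ($id: String!) { video(id: $id) { _id url } }'  # hypothetical query
#   resp = requests.post(
#       'https://a1.allstar.gg/graphql',
#       headers={'content-type': 'application/json'},
#       data=json.dumps({'variables': {'id': '64482c2da9eec30008a67d1b'}, 'query': query}))
#   payload = resp.json()
#   if payload.get('errors'):
#       raise RuntimeError('; '.join(e['message'] for e in payload['errors']))
#   print(payload['data'])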

@@ -2,6 +2,7 @@ import base64
import functools
import hashlib
import itertools
import json
import math
import re
import time
@@ -16,9 +17,11 @@ from ..utils import (
InAdvancePagedList,
OnDemandPagedList,
bool_or_none,
clean_html,
filter_dict,
float_or_none,
format_field,
get_element_by_class,
int_or_none,
join_nonempty,
make_archive_id,
@@ -88,6 +91,12 @@ class BilibiliBaseIE(InfoExtractor):
return formats
def _download_playinfo(self, video_id, cid):
return self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note=f'Downloading video formats for cid {cid}')['data']
def json2srt(self, json_data):
srt_data = ''
for idx, line in enumerate(json_data.get('body') or []):
@@ -96,7 +105,7 @@ class BilibiliBaseIE(InfoExtractor):
f'{line["content"]}\n\n')
return srt_data
def _get_subtitles(self, video_id, aid, cid):
def _get_subtitles(self, video_id, cid, aid=None):
subtitles = {
'danmaku': [{
'ext': 'xml',
@@ -104,8 +113,15 @@ class BilibiliBaseIE(InfoExtractor):
}]
}
video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id)
for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)):
subtitle_info = traverse_obj(self._download_json(
'https://api.bilibili.com/x/player/v2', video_id,
query={'aid': aid, 'cid': cid} if aid else {'bvid': video_id, 'cid': cid},
note=f'Extracting subtitle info {cid}'), ('data', 'subtitle'))
subs_list = traverse_obj(subtitle_info, ('subtitles', lambda _, v: v['subtitle_url'] and v['lan']))
if not subs_list and traverse_obj(subtitle_info, 'allow_submit'):
if not self._get_cookies('https://api.bilibili.com').get('SESSDATA'):  # no login session cookie
self.report_warning(f'CC subtitles (if any) are only visible when logged in. {self._login_hint()}', only_once=True)
for s in subs_list:
subtitles.setdefault(s['lan'], []).append({
'ext': 'srt',
'data': self.json2srt(self._download_json(s['subtitle_url'], video_id))
@@ -155,7 +171,54 @@ class BilibiliBaseIE(InfoExtractor):
for entry in traverse_obj(season_info, (
'result', 'main_section', 'episodes',
lambda _, v: url_or_none(v['share_url']) and v['id'])):
yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}')
yield self.url_result(entry['share_url'], BiliBiliBangumiIE, str_or_none(entry.get('id')))
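# Interactive ('story') videos expose a graph of choice edges; the helpers below walk that
# graph recursively and group edges that reuse the same underlying video segment (cid).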
def _get_divisions(self, video_id, graph_version, edges, edge_id, cid_edges=None):
cid_edges = cid_edges or {}
division_data = self._download_json(
'https://api.bilibili.com/x/stein/edgeinfo_v2', video_id,
query={'graph_version': graph_version, 'edge_id': edge_id, 'bvid': video_id},
note=f'Extracting divisions from edge {edge_id}')
edges.setdefault(edge_id, {}).update(
traverse_obj(division_data, ('data', 'story_list', lambda _, v: v['edge_id'] == edge_id, {
'title': ('title', {str}),
'cid': ('cid', {int_or_none}),
}), get_all=False))
edges[edge_id].update(traverse_obj(division_data, ('data', {
'title': ('title', {str}),
'choices': ('edges', 'questions', ..., 'choices', ..., {
'edge_id': ('id', {int_or_none}),
'cid': ('cid', {int_or_none}),
'text': ('option', {str}),
}),
})))
# use dict to combine edges that use the same video section (same cid)
cid_edges.setdefault(edges[edge_id]['cid'], {})[edge_id] = edges[edge_id]
for choice in traverse_obj(edges, (edge_id, 'choices', ...)):
if choice['edge_id'] not in edges:
edges[choice['edge_id']] = {'cid': choice['cid']}
self._get_divisions(video_id, graph_version, edges, choice['edge_id'], cid_edges=cid_edges)
return cid_edges
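# Each distinct cid then becomes one playlist entry; its choice edges are embedded as JSON
# in the entry description.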
def _get_interactive_entries(self, video_id, cid, metainfo):
graph_version = traverse_obj(
self._download_json(
'https://api.bilibili.com/x/player/wbi/v2', video_id,
'Extracting graph version', query={'bvid': video_id, 'cid': cid}),
('data', 'interaction', 'graph_version', {int_or_none}))
cid_edges = self._get_divisions(video_id, graph_version, {1: {'cid': cid}}, 1)
for cid, edges in cid_edges.items():
play_info = self._download_playinfo(video_id, cid)
yield {
**metainfo,
'id': f'{video_id}_{cid}',
'title': f'{metainfo.get("title")} - {list(edges.values())[0].get("title")}',
'formats': self.extract_formats(play_info),
'description': f'{json.dumps(edges, ensure_ascii=False)}\n{metainfo.get("description", "")}',
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'subtitles': self.extract_subtitles(video_id, cid),
}
class BiliBiliIE(BilibiliBaseIE): class BiliBiliIE(BilibiliBaseIE):
@ -180,7 +243,7 @@ class BiliBiliIE(BilibiliBaseIE):
'view_count': int, 'view_count': int,
}, },
}, { }, {
# old av URL version 'note': 'old av URL version',
'url': 'http://www.bilibili.com/video/av1074402/', 'url': 'http://www.bilibili.com/video/av1074402/',
'info_dict': { 'info_dict': {
'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$',
@ -212,7 +275,7 @@ class BiliBiliIE(BilibiliBaseIE):
'id': 'BV1bK411W797_p1', 'id': 'BV1bK411W797_p1',
'ext': 'mp4', 'ext': 'mp4',
'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
'tags': 'count:11', 'tags': 'count:10',
'timestamp': 1589601697, 'timestamp': 1589601697,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'uploader': '打牌还是打桩', 'uploader': '打牌还是打桩',
@ -232,7 +295,7 @@ class BiliBiliIE(BilibiliBaseIE):
'id': 'BV1bK411W797_p1', 'id': 'BV1bK411W797_p1',
'ext': 'mp4', 'ext': 'mp4',
'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川',
'tags': 'count:11', 'tags': 'count:10',
'timestamp': 1589601697, 'timestamp': 1589601697,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'uploader': '打牌还是打桩', 'uploader': '打牌还是打桩',
@ -343,18 +406,120 @@ class BiliBiliIE(BilibiliBaseIE):
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, {
'note': 'interactive/split-path video',
'url': 'https://www.bilibili.com/video/BV1af4y1H7ga/',
'info_dict': {
'id': 'BV1af4y1H7ga',
'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!!',
'timestamp': 1630500414,
'upload_date': '20210901',
'description': 'md5:01113e39ab06e28042d74ac356a08786',
'tags': list,
'uploader': '钉宫妮妮Ninico',
'duration': 1503,
'uploader_id': '8881297',
'comment_count': int,
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
'playlist_count': 33,
'playlist': [{
'info_dict': {
'id': 'BV1af4y1H7ga_400950101',
'ext': 'mp4',
'title': '【互动游戏】花了大半年时间做的自我介绍~请查收!! - 听见猫猫叫~',
'timestamp': 1630500414,
'upload_date': '20210901',
'description': 'md5:db66ac7a2813a94b8291dbce990cc5b2',
'tags': list,
'uploader': '钉宫妮妮Ninico',
'duration': 11.605,
'uploader_id': '8881297',
'comment_count': int,
'view_count': int,
'like_count': int,
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
}],
}, {
'note': '301 redirect to bangumi link',
'url': 'https://www.bilibili.com/video/BV1TE411f7f1',
'info_dict': {
'id': '288525',
'title': '李永乐老师 钱学森弹道和乘波体飞行器是什么?',
'ext': 'mp4',
'series': '我和我的祖国',
'series_id': '4780',
'season': '幕后纪实',
'season_id': '28609',
'season_number': 1,
'episode': '钱学森弹道和乘波体飞行器是什么?',
'episode_id': '288525',
'episode_number': 105,
'duration': 1183.957,
'timestamp': 1571648124,
'upload_date': '20191021',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
}, {
'url': 'https://www.bilibili.com/video/BV1jL41167ZG/',
'info_dict': {
'id': 'BV1jL41167ZG',
'title': '一场大火引发的离奇死亡!古典推理经典短篇集《不可能犯罪诊断书》!',
'ext': 'mp4',
},
'skip': 'supporter-only video',
}, {
'url': 'https://www.bilibili.com/video/BV1Ks411f7aQ/',
'info_dict': {
'id': 'BV1Ks411f7aQ',
'title': '【BD1080P】狼与香辛料I【华盟】',
'ext': 'mp4',
},
'skip': 'login required',
}, {
'url': 'https://www.bilibili.com/video/BV1GJ411x7h7/',
'info_dict': {
'id': 'BV1GJ411x7h7',
'title': '【官方 MV】Never Gonna Give You Up - Rick Astley',
'ext': 'mp4',
},
'skip': 'geo-restricted',
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
webpage, urlh = self._download_webpage_handle(url, video_id)
if not self._match_valid_url(urlh.url):
return self.url_result(urlh.url)
initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)
is_festival = 'videoData' not in initial_state
if is_festival:
video_data = initial_state['videoInfo']
else:
play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data']
play_info_obj = self._search_json(
r'window\.__playinfo__\s*=', webpage, 'play info', video_id, fatal=False)
if not play_info_obj:
if traverse_obj(initial_state, ('error', 'trueCode')) == -403:
self.raise_login_required()
if traverse_obj(initial_state, ('error', 'trueCode')) == -404:
raise ExtractorError(
'This video may be deleted or geo-restricted. '
'You might want to try a VPN or a proxy server (with --proxy)', expected=True)
play_info = traverse_obj(play_info_obj, ('data', {dict}))
if not play_info:
if traverse_obj(play_info_obj, 'code') == 87007:
toast = get_element_by_class('tips-toast', webpage) or ''
msg = clean_html(
f'{get_element_by_class("belongs-to", toast) or ""}'
+ (get_element_by_class('level', toast) or ''))
raise ExtractorError(
f'This is a supporter-only video: {msg}. {self._login_hint()}', expected=True)
raise ExtractorError('Failed to extract play info')
video_data = initial_state['videoData']
video_id, title = video_data['bvid'], video_data.get('title')
@@ -385,10 +550,7 @@ class BiliBiliIE(BilibiliBaseIE):
festival_info = {}
if is_festival:
play_info = self._download_json(
'https://api.bilibili.com/x/player/playurl', video_id,
query={'bvid': video_id, 'cid': cid, 'fnval': 4048},
note='Extracting festival video formats')['data']
play_info = self._download_playinfo(video_id, cid)
festival_info = traverse_obj(initial_state, {
'uploader': ('videoInfo', 'upName'),
@@ -397,7 +559,7 @@ class BiliBiliIE(BilibiliBaseIE):
'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'),
}, get_all=False)
return {
metainfo = {
**traverse_obj(initial_state, {
'uploader': ('upData', 'name'),
'uploader_id': ('upData', 'mid', {str_or_none}),
@@ -413,28 +575,59 @@ class BiliBiliIE(BilibiliBaseIE):
'comment_count': ('stat', 'reply', {int_or_none}),
}, get_all=False),
'id': f'{video_id}{format_field(part_id, None, "_p%d")}',
'formats': self.extract_formats(play_info),
'_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None,
'title': title,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, aid, cid),
'__post_extractor': self.extract_comments(aid),
'http_headers': {'Referer': url},
}
is_interactive = traverse_obj(video_data, ('rights', 'is_stein_gate'))
if is_interactive:
return self.playlist_result(
self._get_interactive_entries(video_id, cid, metainfo), **metainfo, **{
'duration': traverse_obj(initial_state, ('videoData', 'duration', {int_or_none})),
'__post_extractor': self.extract_comments(aid),
})
else:
return {
**metainfo,
'duration': float_or_none(play_info.get('timelength'), scale=1000),
'chapters': self._get_chapters(aid, cid),
'subtitles': self.extract_subtitles(video_id, cid),
'formats': self.extract_formats(play_info),
'__post_extractor': self.extract_comments(aid),
}
class BiliBiliBangumiIE(BilibiliBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)'
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/ep(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ep21495/',
'info_dict': {
'id': '21495',
'ext': 'mp4',
'series': '悠久之翼',
'series_id': '774',
'season': '第二季',
'season_id': '1182',
'season_number': 2,
'episode': 'foreveref',
'episode_id': '21495',
'episode_number': 12,
'title': '12 foreveref',
'duration': 1420.791,
'timestamp': 1320412200,
'upload_date': '20111104',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
},
}, {
'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'url': 'https://www.bilibili.com/bangumi/play/ep267851',
'info_dict': { 'info_dict': {
'id': '267851', 'id': '267851',
'ext': 'mp4', 'ext': 'mp4',
'series': '鬼灭之刃', 'series': '鬼灭之刃',
'series_id': '4358', 'series_id': '4358',
'season': '鬼灭之刃', 'season': '立志篇',
'season_id': '26801', 'season_id': '26801',
'season_number': 1, 'season_number': 1,
'episode': '残酷', 'episode': '残酷',
@ -446,13 +639,32 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
'upload_date': '20190406', 'upload_date': '20190406',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
}, },
'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' 'skip': 'Geo-restricted',
}, {
'note': 'a making-of which falls outside main section',
'url': 'https://www.bilibili.com/bangumi/play/ep345120',
'info_dict': {
'id': '345120',
'ext': 'mp4',
'series': '鬼灭之刃',
'series_id': '4358',
'season': '立志篇',
'season_id': '26801',
'season_number': 1,
'episode': '炭治郎篇',
'episode_id': '345120',
'episode_number': 27,
'title': '#1 炭治郎篇',
'duration': 1922.129,
'timestamp': 1602853860,
'upload_date': '20201016',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
episode_id = video_id[2:]
webpage = self._download_webpage(url, video_id)
episode_id = self._match_id(url)
webpage = self._download_webpage(url, episode_id)
if '您所在的地区无法观看本片' in webpage:
raise GeoRestrictedError('This video is restricted')
@ -461,7 +673,7 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
headers = {'Referer': url, **self.geo_verification_headers()} headers = {'Referer': url, **self.geo_verification_headers()}
play_info = self._download_json( play_info = self._download_json(
'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, 'https://api.bilibili.com/pgc/player/web/v2/playurl', episode_id,
'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id},
headers=headers) headers=headers)
premium_only = play_info.get('code') == -10403 premium_only = play_info.get('code') == -10403
@ -472,40 +684,43 @@ class BiliBiliBangumiIE(BilibiliBaseIE):
self.raise_login_required('This video is for premium members only') self.raise_login_required('This video is for premium members only')
bangumi_info = self._download_json( bangumi_info = self._download_json(
'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', 'https://api.bilibili.com/pgc/view/web/season', episode_id, 'Get episode details',
query={'ep_id': episode_id}, headers=headers)['result'] query={'ep_id': episode_id}, headers=headers)['result']
episode_number, episode_info = next(( episode_number, episode_info = next((
(idx, ep) for idx, ep in enumerate(traverse_obj( (idx, ep) for idx, ep in enumerate(traverse_obj(
bangumi_info, ('episodes', ..., {dict})), 1) bangumi_info, (('episodes', ('section', ..., 'episodes')), ..., {dict})), 1)
if str_or_none(ep.get('id')) == episode_id), (1, {})) if str_or_none(ep.get('id')) == episode_id), (1, {}))
season_id = bangumi_info.get('season_id') season_id = bangumi_info.get('season_id')
season_number = season_id and next(( season_number, season_title = season_id and next((
idx + 1 for idx, e in enumerate( (idx + 1, e.get('season_title')) for idx, e in enumerate(
traverse_obj(bangumi_info, ('seasons', ...))) traverse_obj(bangumi_info, ('seasons', ...)))
if e.get('season_id') == season_id if e.get('season_id') == season_id
), None) ), (None, None))
aid = episode_info.get('aid') aid = episode_info.get('aid')
return { return {
'id': video_id, 'id': episode_id,
'formats': formats, 'formats': formats,
**traverse_obj(bangumi_info, { **traverse_obj(bangumi_info, {
'series': ('series', 'series_title', {str}), 'series': ('series', 'series_title', {str}),
'series_id': ('series', 'series_id', {str_or_none}), 'series_id': ('series', 'series_id', {str_or_none}),
'thumbnail': ('square_cover', {url_or_none}), 'thumbnail': ('square_cover', {url_or_none}),
}), }),
'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), **traverse_obj(episode_info, {
'episode': episode_info.get('long_title'), 'episode': ('long_title', {str}),
'episode_number': ('title', {int_or_none}, {lambda x: x or episode_number}),
'timestamp': ('pub_time', {int_or_none}),
'title': {lambda v: v and join_nonempty('title', 'long_title', delim=' ', from_dict=v)},
}),
'episode_id': episode_id, 'episode_id': episode_id,
'episode_number': int_or_none(episode_info.get('title')) or episode_number, 'season': str_or_none(season_title),
'season_id': str_or_none(season_id), 'season_id': str_or_none(season_id),
'season_number': season_number, 'season_number': season_number,
'timestamp': int_or_none(episode_info.get('pub_time')),
'duration': float_or_none(play_info.get('timelength'), scale=1000), 'duration': float_or_none(play_info.get('timelength'), scale=1000),
'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), 'subtitles': self.extract_subtitles(episode_id, episode_info.get('cid'), aid=aid),
'__post_extractor': self.extract_comments(aid), '__post_extractor': self.extract_comments(aid),
'http_headers': headers, 'http_headers': headers,
} }
@ -517,17 +732,53 @@ class BiliBiliBangumiMediaIE(BilibiliBaseIE):
'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'url': 'https://www.bilibili.com/bangumi/media/md24097891',
'info_dict': { 'info_dict': {
'id': '24097891', 'id': '24097891',
'title': 'CAROLE & TUESDAY',
'description': 'md5:42417ad33d1eaa1c93bfd2dd1626b829',
}, },
'playlist_mincount': 25, 'playlist_mincount': 25,
}, {
'url': 'https://www.bilibili.com/bangumi/media/md1565/',
'info_dict': {
'id': '1565',
'title': '攻壳机动队 S.A.C. 2nd GIG',
'description': 'md5:46cac00bafd645b97f4d6df616fc576d',
},
'playlist_count': 26,
'playlist': [{
'info_dict': {
'id': '68540',
'ext': 'mp4',
'series': '攻壳机动队',
'series_id': '1077',
'season': '第二季',
'season_id': '1565',
'season_number': 2,
'episode': '再启动 REEMBODY',
'episode_id': '68540',
'episode_number': 1,
'title': '1 再启动 REEMBODY',
'duration': 1525.777,
'timestamp': 1425074413,
'upload_date': '20150227',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
},
}],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
media_id = self._match_id(url) media_id = self._match_id(url)
webpage = self._download_webpage(url, media_id) webpage = self._download_webpage(url, media_id)
ss_id = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id']
return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) initial_state = self._search_json(
r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)
ss_id = initial_state['mediaInfo']['season_id']
return self.playlist_result(
self._get_episodes_from_season(ss_id, url), media_id,
**traverse_obj(initial_state, ('mediaInfo', {
'title': ('title', {str}),
'description': ('evaluate', {str}),
})))
class BiliBiliBangumiSeasonIE(BilibiliBaseIE): class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
@ -535,15 +786,183 @@ class BiliBiliBangumiSeasonIE(BilibiliBaseIE):
_TESTS = [{ _TESTS = [{
'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'url': 'https://www.bilibili.com/bangumi/play/ss26801',
'info_dict': { 'info_dict': {
'id': '26801' 'id': '26801',
'title': '鬼灭之刃',
'description': 'md5:e2cc9848b6f69be6db79fc2a82d9661b',
}, },
'playlist_mincount': 26 'playlist_mincount': 26
}, {
'url': 'https://www.bilibili.com/bangumi/play/ss2251',
'info_dict': {
'id': '2251',
'title': '玲音',
'description': 'md5:1fd40e3df4c08d4d9d89a6a34844bdc4',
},
'playlist_count': 13,
'playlist': [{
'info_dict': {
'id': '50188',
'ext': 'mp4',
'series': '玲音',
'series_id': '1526',
'season': 'TV',
'season_id': '2251',
'season_number': 1,
'episode': 'WEIRD',
'episode_id': '50188',
'episode_number': 1,
'title': '1 WEIRD',
'duration': 1436.992,
'timestamp': 1343185080,
'upload_date': '20120725',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$'
},
}],
}] }]
def _real_extract(self, url): def _real_extract(self, url):
ss_id = self._match_id(url) ss_id = self._match_id(url)
webpage = self._download_webpage(url, ss_id)
metainfo = traverse_obj(
self._search_json(r'<script[^>]+type="application/ld\+json"[^>]*>', webpage, 'info', ss_id),
('itemListElement', ..., {
'title': ('name', {str}),
'description': ('description', {str}),
}), get_all=False)
return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id, **metainfo)
class BilibiliCheeseBaseIE(BilibiliBaseIE):
_HEADERS = {'Referer': 'https://www.bilibili.com/'}
def _extract_episode(self, season_info, ep_id):
episode_info = traverse_obj(season_info, (
'episodes', lambda _, v: v['id'] == int(ep_id)), get_all=False)
aid, cid = episode_info['aid'], episode_info['cid']
if traverse_obj(episode_info, 'ep_status') == -1:
raise ExtractorError('This course episode is not yet available.', expected=True)
if not traverse_obj(episode_info, 'playable'):
self.raise_login_required('You need to purchase the course to download this episode')
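# Paid courses ('pugv'/cheese) use their own playurl endpoint, separate from the regular
# video API used by _download_playinfo above.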
play_info = self._download_json(
'https://api.bilibili.com/pugv/player/web/playurl', ep_id,
query={'avid': aid, 'cid': cid, 'ep_id': ep_id, 'fnval': 16, 'fourk': 1},
headers=self._HEADERS, note='Downloading playinfo')['data']
return {
'id': str_or_none(ep_id),
'episode_id': str_or_none(ep_id),
'formats': self.extract_formats(play_info),
'extractor_key': BilibiliCheeseIE.ie_key(),
'extractor': BilibiliCheeseIE.IE_NAME,
'webpage_url': f'https://www.bilibili.com/cheese/play/ep{ep_id}',
**traverse_obj(episode_info, {
'episode': ('title', {str}),
'title': {lambda v: v and join_nonempty('index', 'title', delim=' - ', from_dict=v)},
'alt_title': ('subtitle', {str}),
'duration': ('duration', {int_or_none}),
'episode_number': ('index', {int_or_none}),
'thumbnail': ('cover', {url_or_none}),
'timestamp': ('release_date', {int_or_none}),
'view_count': ('play', {int_or_none}),
}),
**traverse_obj(season_info, {
'uploader': ('up_info', 'uname', {str}),
'uploader_id': ('up_info', 'mid', {str_or_none}),
}),
'subtitles': self.extract_subtitles(ep_id, cid, aid=aid),
'__post_extractor': self.extract_comments(aid),
'http_headers': self._HEADERS,
}
def _download_season_info(self, query_key, video_id):
return self._download_json(
f'https://api.bilibili.com/pugv/view/web/season?{query_key}={video_id}', video_id,
headers=self._HEADERS, note='Downloading season info')['data']
class BilibiliCheeseIE(BilibiliCheeseBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ep(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/cheese/play/ep229832',
'info_dict': {
'id': '229832',
'ext': 'mp4',
'title': '1 - 课程先导片',
'alt_title': '视频课·3分41秒',
'uploader': '马督工',
'uploader_id': '316568752',
'episode': '课程先导片',
'episode_id': '229832',
'episode_number': 1,
'duration': 221,
'timestamp': 1695549606,
'upload_date': '20230924',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'view_count': int,
}
}]
def _real_extract(self, url):
ep_id = self._match_id(url)
return self._extract_episode(self._download_season_info('ep_id', ep_id), ep_id)
class BilibiliCheeseSeasonIE(BilibiliCheeseBaseIE):
_VALID_URL = r'https?://(?:www\.)?bilibili\.com/cheese/play/ss(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.bilibili.com/cheese/play/ss5918',
'info_dict': {
'id': '5918',
'title': '【限时五折】新闻系学不到:马督工教你做自媒体',
'description': '帮普通人建立世界模型,降低人与人的沟通门槛',
},
'playlist': [{
'info_dict': {
'id': '229832',
'ext': 'mp4',
'title': '1 - 课程先导片',
'alt_title': '视频课·3分41秒',
'uploader': '马督工',
'uploader_id': '316568752',
'episode': '课程先导片',
'episode_id': '229832',
'episode_number': 1,
'duration': 221,
'timestamp': 1695549606,
'upload_date': '20230924',
'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
'view_count': int,
}
}],
'params': {'playlist_items': '1'},
}, {
'url': 'https://www.bilibili.com/cheese/play/ss5918',
'info_dict': {
'id': '5918',
'title': '【限时五折】新闻系学不到:马督工教你做自媒体',
'description': '帮普通人建立世界模型,降低人与人的沟通门槛',
},
'playlist_mincount': 5,
'skip': 'paid video in list',
}]
def _get_cheese_entries(self, season_info):
for ep_id in traverse_obj(season_info, ('episodes', lambda _, v: v['episode_can_view'], 'id')):
yield self._extract_episode(season_info, ep_id)
def _real_extract(self, url):
season_id = self._match_id(url)
season_info = self._download_season_info('season_id', season_id)
return self.playlist_result(
self._get_cheese_entries(season_info), season_id,
**traverse_obj(season_info, {
'title': ('title', {str}),
'description': ('subtitle', {str}),
}))
class BilibiliSpaceBaseIE(InfoExtractor): class BilibiliSpaceBaseIE(InfoExtractor):

@@ -2341,7 +2341,9 @@ class InfoExtractor:
imgs_count = 0
srcs = set()
media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
media = itertools.chain.from_iterable(
smil.findall(self._xpath_ns(arg, namespace))
for arg in ['.//video', './/audio', './/media'])
for medium in media:
src = medium.get('src')
if not src or src in srcs:

@@ -53,21 +53,6 @@ class DuoplayIE(InfoExtractor):
'episode_id': 14,
'release_year': 2010,
},
}, {
'note': 'Movie',
'url': 'https://duoplay.ee/4325/naljamangud',
'md5': '2b0bcac4159a08b1844c2bfde06b1199',
'info_dict': {
'id': '4325',
'ext': 'mp4',
'title': 'Näljamängud',
'thumbnail': r're:https://.+\.jpg(?:\?c=\d+)?$',
'description': 'md5:fb35f5eb2ff46cdb82e4d5fbe7b49a13',
'cast': ['Jennifer Lawrence', 'Josh Hutcherson', 'Liam Hemsworth'],
'upload_date': '20231109',
'timestamp': 1699552800,
'release_year': 2012,
},
}, {
'note': 'Movie without expiry',
'url': 'https://duoplay.ee/5501/pilvede-all.-neljas-ode',

@@ -2,11 +2,9 @@ import re
from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..dependencies import websockets
from ..networking import Request
from ..utils import (
ExtractorError,
WebSocketsWrapper,
js_to_json,
traverse_obj,
update_url_query,
@@ -167,8 +165,6 @@ class FC2LiveIE(InfoExtractor):
}]
def _real_extract(self, url):
if not websockets:
raise ExtractorError('websockets library is not available. Please install it.', expected=True)
video_id = self._match_id(url)
webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id)
@@ -199,13 +195,9 @@ class FC2LiveIE(InfoExtractor):
ws_url = update_url_query(control_server['url'], {'control_token': control_server['control_token']})
playlist_data = None
self.to_screen('%s: Fetching HLS playlist info via WebSocket' % video_id)
ws = WebSocketsWrapper(ws_url, {
'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:],
'Origin': 'https://live.fc2.com',
'Accept': '*/*',
'User-Agent': self.get_param('http_headers')['User-Agent'],
})
ws = self._request_webpage(Request(ws_url, headers={
'Origin': 'https://live.fc2.com',
}), video_id, note='Fetching HLS playlist info via WebSocket')
self.write_debug('Sending HLS server request')

@@ -3,230 +3,306 @@ import json
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import ExtractorError, make_archive_id, parse_iso8601, remove_start
from ..utils import (
ExtractorError,
int_or_none,
make_archive_id,
parse_iso8601,
smuggle_url,
try_call,
unsmuggle_url,
update_url_query,
url_or_none,
urljoin,
)
from ..utils.traversal import traverse_obj
_BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
class NebulaBaseIE(InfoExtractor): class NebulaBaseIE(InfoExtractor):
_NETRC_MACHINE = 'watchnebula' _NETRC_MACHINE = 'watchnebula'
_token = _api_token = None
_nebula_api_token = None def _perform_login(self, username, password):
_nebula_bearer_token = None
def _perform_nebula_auth(self, username, password):
if not username or not password:
self.raise_login_required(method='password')
data = json.dumps({'email': username, 'password': password}).encode('utf8')
response = self._download_json(
'https://api.watchnebula.com/api/v1/auth/login/',
data=data, fatal=False, video_id=None,
headers={
'content-type': 'application/json',
# Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
'cookie': ''
},
note='Logging in to Nebula with supplied credentials',
errnote='Authentication failed or rejected')
if not response or not response.get('key'):
self.raise_login_required(method='password')
return response['key']
def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
assert method in ('GET', 'POST',)
assert auth_type in ('api', 'bearer',)
def inner_call():
authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
return self._download_json(
url, video_id, note=note, headers={'Authorization': authorization},
data=b'' if method == 'POST' else None)
try: try:
return inner_call() response = self._download_json(
except ExtractorError as exc: 'https://nebula.tv/auth/login/', None,
# if 401 or 403, attempt credential re-auth and retry 'Logging in to Nebula', 'Login failed',
if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.status in (401, 403): data=json.dumps({'email': username, 'password': password}).encode(),
self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') headers={'content-type': 'application/json'})
self._perform_login() except ExtractorError as e:
return inner_call() if isinstance(e.cause, HTTPError) and e.cause.status == 400:
else: raise ExtractorError('Login failed: Invalid username or password', expected=True)
raise
self._api_token = traverse_obj(response, ('key', {str}))
if not self._api_token:
raise ExtractorError('Login failed: No token')
def _call_api(self, *args, **kwargs):
if self._token:
kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
try:
return self._download_json(*args, **kwargs)
except ExtractorError as e:
if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403):
raise
self.to_screen(
f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}')
self._real_initialize()
if self._token:
kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
return self._download_json(*args, **kwargs)
def _real_initialize(self):
if not self._api_token:
self._api_token = try_call(
lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value)
self._token = self._download_json(
'https://users.api.nebula.app/api/v1/authorization/', None,
headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None,
note='Authorizing to Nebula', data=b'')['token']
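# The manifest is fetched with the bearer token as a query parameter; a 401 means a login is
# required, and a single 403 triggers one re-authorization attempt before giving up.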
def _extract_formats(self, content_id, slug):
for retry in (False, True):
try:
fmts, subs = self._extract_m3u8_formats_and_subtitles(
f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8',
slug, 'mp4', query={
'token': self._token,
'app_version': '23.10.0',
'platform': 'ios',
})
return {'formats': fmts, 'subtitles': subs}
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 401:
self.raise_login_required()
if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403:
self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error')
self._real_initialize()
continue
raise raise
def _fetch_nebula_bearer_token(self): def _extract_video_metadata(self, episode):
""" channel_url = traverse_obj(
Get a Bearer token for the Nebula API. This will be required to fetch video meta data. episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False)
"""
response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
method='POST',
note='Authorizing to Nebula')
return response['token']
def _fetch_video_formats(self, slug):
stream_info = self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/stream/',
video_id=slug,
auth_type='bearer',
note='Fetching video stream info')
manifest_url = stream_info['manifest']
return self._extract_m3u8_formats_and_subtitles(manifest_url, slug, 'mp4')
def _build_video_info(self, episode):
fmts, subs = self._fetch_video_formats(episode['slug'])
channel_slug = episode['channel_slug']
channel_title = episode['channel_title']
zype_id = episode.get('zype_id')
return { return {
'id': remove_start(episode['id'], 'video_episode:'), 'id': episode['id'].partition(':')[2],
'display_id': episode['slug'], **traverse_obj(episode, {
'formats': fmts, 'display_id': 'slug',
'subtitles': subs, 'title': 'title',
'webpage_url': f'https://nebula.tv/{episode["slug"]}', 'description': 'description',
'title': episode['title'], 'timestamp': ('published_at', {parse_iso8601}),
'description': episode['description'], 'duration': ('duration', {int_or_none}),
'timestamp': parse_iso8601(episode['published_at']), 'channel_id': 'channel_slug',
'thumbnails': [{ 'uploader_id': 'channel_slug',
# 'id': tn.get('name'), # this appears to be null 'channel': 'channel_title',
'url': tn['original'], 'uploader': 'channel_title',
'height': key, 'series': 'channel_title',
} for key, tn in episode['assets']['thumbnail'].items()], 'creator': 'channel_title',
'duration': episode['duration'], 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}),
'channel': channel_title, 'episode_number': ('order', {int_or_none}),
'channel_id': channel_slug, # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
'channel_url': f'https://nebula.tv/{channel_slug}', '_old_archive_ids': ('zype_id', {lambda x: [
'uploader': channel_title, make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}),
'uploader_id': channel_slug, }),
'uploader_url': f'https://nebula.tv/{channel_slug}', 'channel_url': channel_url,
'series': channel_title, 'uploader_url': channel_url,
'creator': channel_title,
'extractor_key': NebulaIE.ie_key(),
'extractor': NebulaIE.IE_NAME,
'_old_archive_ids': [make_archive_id(NebulaIE, zype_id)] if zype_id else None,
} }
def _perform_login(self, username=None, password=None):
self._nebula_api_token = self._perform_nebula_auth(username, password)
self._nebula_bearer_token = self._fetch_nebula_bearer_token()
class NebulaIE(NebulaBaseIE): class NebulaIE(NebulaBaseIE):
_VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)'
_TESTS = [ _TESTS = [{
{ 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast', 'info_dict': {
'md5': '14944cfee8c7beeea106320c47560efc', 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
'info_dict': { 'ext': 'mp4',
'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf', 'title': 'That Time Disney Remade Beauty and the Beast',
'ext': 'mp4', 'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4',
'title': 'That Time Disney Remade Beauty and the Beast', 'upload_date': '20180731',
'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We werent able to remove it without reducing video quality, so its presented here in its original context.', 'timestamp': 1533009600,
'upload_date': '20180731', 'channel': 'Lindsay Ellis',
'timestamp': 1533009600, 'channel_id': 'lindsayellis',
'channel': 'Lindsay Ellis', 'uploader': 'Lindsay Ellis',
'channel_id': 'lindsayellis', 'uploader_id': 'lindsayellis',
'uploader': 'Lindsay Ellis', 'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis',
'uploader_id': 'lindsayellis', 'series': 'Lindsay Ellis',
'uploader_url': 'https://nebula.tv/lindsayellis', 'display_id': 'that-time-disney-remade-beauty-and-the-beast',
'series': 'Lindsay Ellis', 'channel_url': r're:https://nebula\.(tv|app)/lindsayellis',
'display_id': 'that-time-disney-remade-beauty-and-the-beast', 'creator': 'Lindsay Ellis',
'channel_url': 'https://nebula.tv/lindsayellis', 'duration': 2212,
'creator': 'Lindsay Ellis', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'duration': 2212, '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'],
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*',
},
}, },
{ 'params': {'skip_download': 'm3u8'},
'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', }, {
'md5': 'd05739cf6c38c09322422f696b569c23', 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
'info_dict': { 'md5': 'd05739cf6c38c09322422f696b569c23',
'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34', 'info_dict': {
'ext': 'mp4', 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
'title': 'Landing Craft - How The Allies Got Ashore', 'ext': 'mp4',
'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'title': 'Landing Craft - How The Allies Got Ashore',
'upload_date': '20200327', 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
'timestamp': 1585348140, 'upload_date': '20200327',
'channel': 'Real Engineering — The Logistics of D-Day', 'timestamp': 1585348140,
'channel_id': 'd-day', 'channel': 'Real Engineering — The Logistics of D-Day',
'uploader': 'Real Engineering — The Logistics of D-Day', 'channel_id': 'd-day',
'uploader_id': 'd-day', 'uploader': 'Real Engineering — The Logistics of D-Day',
'series': 'Real Engineering — The Logistics of D-Day', 'uploader_id': 'd-day',
'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', 'series': 'Real Engineering — The Logistics of D-Day',
'creator': 'Real Engineering — The Logistics of D-Day', 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
'duration': 841, 'creator': 'Real Engineering — The Logistics of D-Day',
'channel_url': 'https://nebula.tv/d-day', 'duration': 841,
'uploader_url': 'https://nebula.tv/d-day', 'channel_url': 'https://nebula.tv/d-day',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', 'uploader_url': 'https://nebula.tv/d-day',
}, 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'],
}, },
{ 'params': {'skip_download': 'm3u8'},
'url': 'https://nebula.tv/videos/money-episode-1-the-draw', }, {
'md5': 'ebe28a7ad822b9ee172387d860487868', 'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
'info_dict': { 'md5': 'ebe28a7ad822b9ee172387d860487868',
'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553', 'info_dict': {
'ext': 'mp4', 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
'title': 'Episode 1: The Draw', 'ext': 'mp4',
'description': r'contains:Theres free money on offer… if the players can all work together.', 'title': 'Episode 1: The Draw',
'upload_date': '20200323', 'description': r'contains:Theres free money on offer… if the players can all work together.',
'timestamp': 1584980400, 'upload_date': '20200323',
'channel': 'Tom Scott Presents: Money', 'timestamp': 1584980400,
'channel_id': 'tom-scott-presents-money', 'channel': 'Tom Scott Presents: Money',
'uploader': 'Tom Scott Presents: Money', 'channel_id': 'tom-scott-presents-money',
'uploader_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money',
'uploader_url': 'https://nebula.tv/tom-scott-presents-money', 'uploader_id': 'tom-scott-presents-money',
'duration': 825, 'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
'channel_url': 'https://nebula.tv/tom-scott-presents-money', 'duration': 825,
'series': 'Tom Scott Presents: Money', 'channel_url': 'https://nebula.tv/tom-scott-presents-money',
'display_id': 'money-episode-1-the-draw', 'series': 'Tom Scott Presents: Money',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', 'display_id': 'money-episode-1-the-draw',
'creator': 'Tom Scott Presents: Money', 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
}, 'creator': 'Tom Scott Presents: Money',
'_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'],
}, },
{ 'params': {'skip_download': 'm3u8'},
'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', }, {
'only_matching': True, 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
'only_matching': True,
}, {
'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
'info_dict': {
'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d',
'ext': 'mp4',
'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
'title': 'Did the US Really Blow Up the NordStream Pipelines?',
'description': 'md5:b4e2a14e3ff08f546a3209c75261e789',
'upload_date': '20230223',
'timestamp': 1677144070,
'channel': 'TLDR News EU',
'channel_id': 'tldrnewseu',
'uploader': 'TLDR News EU',
'uploader_id': 'tldrnewseu',
'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu',
'duration': 524,
'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu',
'series': 'TLDR News EU',
'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
'creator': 'TLDR News EU',
'_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'],
}, },
{ 'params': {'skip_download': 'm3u8'},
'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw', }, {
'only_matching': True, 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
}, 'only_matching': True,
] }]
def _fetch_video_metadata(self, slug):
return self._call_nebula_api(f'https://content.api.nebula.app/video/{slug}/',
video_id=slug,
auth_type='bearer',
note='Fetching video meta data')
def _real_extract(self, url): def _real_extract(self, url):
slug = self._match_id(url) slug = self._match_id(url)
video = self._fetch_video_metadata(slug) url, smuggled_data = unsmuggle_url(url, {})
return self._build_video_info(video) if smuggled_data.get('id'):
return {
'id': smuggled_data['id'],
'display_id': slug,
'title': '',
**self._extract_formats(smuggled_data['id'], slug),
}
metadata = self._call_api(
f'https://content.api.nebula.app/content/videos/{slug}',
slug, note='Fetching video metadata')
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
class NebulaClassIE(NebulaBaseIE):
IE_NAME = 'nebula:class'
_VALID_URL = rf'{_BASE_URL_RE}/(?P<id>[-\w]+)/(?P<ep>\d+)'
_TESTS = [{
'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
'info_dict': {
'id': 'd7432cdc-c608-474d-942c-f74345daed7b',
'ext': 'mp4',
'display_id': '14',
'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit',
'episode_number': 14,
'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9',
'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit',
'duration': 646,
'episode': 'Episode 14',
'title': 'Photos, Sculpture, and Video',
},
'params': {'skip_download': 'm3u8'},
}]
def _real_extract(self, url):
slug, episode = self._match_valid_url(url).group('id', 'ep')
url, smuggled_data = unsmuggle_url(url, {})
if smuggled_data.get('id'):
return {
'id': smuggled_data['id'],
'display_id': slug,
'title': '',
**self._extract_formats(smuggled_data['id'], slug),
}
metadata = self._call_api(
f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
slug, note='Fetching video metadata')
return {
**self._extract_video_metadata(metadata),
**self._extract_formats(metadata['id'], slug),
}
class NebulaSubscriptionsIE(NebulaBaseIE): class NebulaSubscriptionsIE(NebulaBaseIE):
IE_NAME = 'nebula:subscriptions' IE_NAME = 'nebula:subscriptions'
_VALID_URL = rf'{_BASE_URL_RE}/myshows' _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)'
_TESTS = [ _TESTS = [{
{ 'url': 'https://nebula.tv/myshows',
'url': 'https://nebula.tv/myshows', 'playlist_mincount': 1,
'playlist_mincount': 1, 'info_dict': {
'info_dict': { 'id': 'myshows',
'id': 'myshows',
},
}, },
] }]
def _generate_playlist_entries(self): def _generate_playlist_entries(self):
next_url = 'https://content.watchnebula.com/library/video/?page_size=100' next_url = update_url_query('https://content.api.nebula.app/video_episodes/', {
page_num = 1 'following': 'true',
while next_url: 'include': 'engagement',
channel = self._call_nebula_api(next_url, 'myshows', auth_type='bearer', 'ordering': '-published_at',
note=f'Retrieving subscriptions page {page_num}') })
for page_num in itertools.count(1):
channel = self._call_api(
next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}')
for episode in channel['results']: for episode in channel['results']:
yield self._build_video_info(episode) metadata = self._extract_video_metadata(episode)
next_url = channel['next'] yield self.url_result(smuggle_url(
page_num += 1 f'https://nebula.tv/videos/{metadata["display_id"]}',
{'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
next_url = channel.get('next')
if not next_url:
return
def _real_extract(self, url): def _real_extract(self, url):
return self.playlist_result(self._generate_playlist_entries(), 'myshows') return self.playlist_result(self._generate_playlist_entries(), 'myshows')
@ -234,48 +310,74 @@ class NebulaSubscriptionsIE(NebulaBaseIE):
class NebulaChannelIE(NebulaBaseIE): class NebulaChannelIE(NebulaBaseIE):
IE_NAME = 'nebula:channel' IE_NAME = 'nebula:channel'
_VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos/)(?P<id>[-\w]+)/?(?:$|[?#])'
_TESTS = [ _TESTS = [{
{ 'url': 'https://nebula.tv/tom-scott-presents-money',
'url': 'https://nebula.tv/tom-scott-presents-money', 'info_dict': {
'info_dict': { 'id': 'tom-scott-presents-money',
'id': 'tom-scott-presents-money', 'title': 'Tom Scott Presents: Money',
'title': 'Tom Scott Presents: Money', 'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
},
'playlist_count': 5,
}, {
'url': 'https://nebula.tv/lindsayellis',
'info_dict': {
'id': 'lindsayellis',
'title': 'Lindsay Ellis',
'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
},
'playlist_mincount': 2,
}, },
] 'playlist_count': 5,
}, {
'url': 'https://nebula.tv/lindsayellis',
'info_dict': {
'id': 'lindsayellis',
'title': 'Lindsay Ellis',
'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
},
'playlist_mincount': 2,
}, {
'url': 'https://nebula.tv/johnnyharris',
'info_dict': {
'id': 'johnnyharris',
'title': 'Johnny Harris',
'description': 'I make videos about maps and many other things.',
},
'playlist_mincount': 90,
}, {
'url': 'https://nebula.tv/copyright-for-fun-and-profit',
'info_dict': {
'id': 'copyright-for-fun-and-profit',
'title': 'Copyright for Fun and Profit',
'description': 'md5:6690248223eed044a9f11cd5a24f9742',
},
'playlist_count': 23,
}]
def _generate_playlist_entries(self, collection_id, channel): def _generate_playlist_entries(self, collection_id, collection_slug):
episodes = channel['episodes']['results'] next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at'
for page_num in itertools.count(2): for page_num in itertools.count(1):
for episode in episodes: episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}')
yield self._build_video_info(episode) for episode in episodes['results']:
next_url = channel['episodes']['next'] metadata = self._extract_video_metadata(episode)
yield self.url_result(smuggle_url(
episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}',
{'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
next_url = episodes.get('next')
if not next_url: if not next_url:
break break
channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer',
note=f'Retrieving channel page {page_num}') def _generate_class_entries(self, channel):
episodes = channel['episodes']['results'] for lesson in channel['lessons']:
metadata = self._extract_video_metadata(lesson)
yield self.url_result(smuggle_url(
lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
{'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
def _real_extract(self, url): def _real_extract(self, url):
collection_id = self._match_id(url) collection_slug = self._match_id(url)
channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' channel = self._call_api(
channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons',
channel_details = channel['details'] collection_slug, note='Retrieving channel')
if channel.get('type') == 'class':
entries = self._generate_class_entries(channel)
else:
entries = self._generate_playlist_entries(channel['id'], collection_slug)
return self.playlist_result( return self.playlist_result(
entries=self._generate_playlist_entries(collection_id, channel), entries=entries,
playlist_id=collection_id, playlist_id=collection_slug,
playlist_title=channel_details['title'], playlist_title=channel.get('title'),
playlist_description=channel_details['description'] playlist_description=channel.get('description'))
)

@@ -8,12 +8,11 @@ import time
from urllib.parse import urlparse
from .common import InfoExtractor, SearchInfoExtractor
from ..dependencies import websockets
from ..networking import Request
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
OnDemandPagedList,
WebSocketsWrapper,
bug_reports_message,
clean_html,
float_or_none,
@ -934,8 +933,6 @@ class NiconicoLiveIE(InfoExtractor):
_KNOWN_LATENCY = ('high', 'low') _KNOWN_LATENCY = ('high', 'low')
def _real_extract(self, url): def _real_extract(self, url):
if not websockets:
raise ExtractorError('websockets library is not available. Please install it.', expected=True)
video_id = self._match_id(url) video_id = self._match_id(url)
webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
@ -950,17 +947,13 @@ class NiconicoLiveIE(InfoExtractor):
}) })
hostname = remove_start(urlparse(urlh.url).hostname, 'sp.') hostname = remove_start(urlparse(urlh.url).hostname, 'sp.')
cookies = try_get(urlh.url, self._downloader._calc_cookies)
latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
if latency not in self._KNOWN_LATENCY: if latency not in self._KNOWN_LATENCY:
latency = 'high' latency = 'high'
ws = WebSocketsWrapper(ws_url, { ws = self._request_webpage(
'Cookies': str_or_none(cookies) or '', Request(ws_url, headers={'Origin': f'https://{hostname}'}),
'Origin': f'https://{hostname}', video_id=video_id, note='Connecting to WebSocket server')
'Accept': '*/*',
'User-Agent': self.get_param('http_headers')['User-Agent'],
})
self.write_debug('[debug] Sending HLS server request') self.write_debug('[debug] Sending HLS server request')
ws.send(json.dumps({ ws.send(json.dumps({
@ -1034,7 +1027,6 @@ class NiconicoLiveIE(InfoExtractor):
'protocol': 'niconico_live', 'protocol': 'niconico_live',
'ws': ws, 'ws': ws,
'video_id': video_id, 'video_id': video_id,
'cookies': cookies,
'live_latency': latency, 'live_latency': latency,
'origin': hostname, 'origin': hostname,
}) })

@@ -1,57 +1,131 @@
import re
import json
import urllib.parse
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..utils import (
ExtractorError,
make_archive_id,
unified_timestamp,
urljoin,
)
from ..utils.traversal import traverse_obj
class NintendoIE(InfoExtractor): class NintendoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:(?P<locale>\w{2}(?:-\w{2})?)/)?nintendo-direct/(?P<slug>[^/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/',
'info_dict': {
'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
'ext': 'flv',
'title': 'Duck Hunt Wii U VC NES - Trailer',
'duration': 60.326,
},
'params': {
'skip_download': True,
},
'add_ie': ['Ooyala'],
}, {
'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u',
'info_dict': {
'id': 'tokyo-mirage-sessions-fe-wii-u',
'title': 'Tokyo Mirage Sessions ♯FE',
},
'playlist_count': 4,
}, {
'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/', 'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/',
'info_dict': { 'info_dict': {
'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Switch_ROS_ND0904-H264.mov', 'id': '2oPmiviVePUA1IqAZzjuVh',
'duration': 2324.758, 'display_id': '09-04-2019',
'title': 'Nintendo Direct 9.4.2019',
'timestamp': 1567580400,
'description': 'md5:8aac2780361d8cb772b6d1de66d7d6f4',
'upload_date': '20190904',
'age_limit': 17,
'_old_archive_ids': ['nintendo J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V'],
}, },
'params': { }, {
'skip_download': True, 'url': 'https://www.nintendo.com/en-ca/nintendo-direct/08-31-2023/',
'info_dict': {
'ext': 'mp4',
'id': '2TB2w2rJhNYF84qQ9E57hU',
'display_id': '08-31-2023',
'title': 'Super Mario Bros. Wonder Direct 8.31.2023',
'timestamp': 1693465200,
'description': 'md5:3067c5b824bcfdae9090a7f38ab2d200',
'tags': ['Mild Fantasy Violence', 'In-Game Purchases'],
'upload_date': '20230831',
'age_limit': 6,
},
}, {
'url': 'https://www.nintendo.com/us/nintendo-direct/50-fact-extravaganza/',
'info_dict': {
'ext': 'mp4',
'id': 'j0BBGzfw0pQ',
'channel_follower_count': int,
'view_count': int,
'description': 'Learn new details about Super Smash Bros. for Wii U, which launches on November 21.',
'duration': 2123,
'availability': 'public',
'thumbnail': 'https://i.ytimg.com/vi_webp/j0BBGzfw0pQ/maxresdefault.webp',
'timestamp': 1414047600,
'channel_id': 'UCGIY_O-8vW4rfX98KlMkvRg',
'chapters': 'count:53',
'heatmap': 'count:100',
'upload_date': '20141023',
'uploader_id': '@NintendoAmerica',
'playable_in_embed': True,
'categories': ['Gaming'],
'display_id': '50-fact-extravaganza',
'channel': 'Nintendo of America',
'tags': ['Comic Mischief', 'Cartoon Violence', 'Mild Suggestive Themes'],
'like_count': int,
'channel_url': 'https://www.youtube.com/channel/UCGIY_O-8vW4rfX98KlMkvRg',
'age_limit': 10,
'uploader_url': 'https://www.youtube.com/@NintendoAmerica',
'comment_count': int,
'live_status': 'not_live',
'uploader': 'Nintendo of America',
'title': '50-FACT Extravaganza',
}, },
'add_ie': ['Ooyala'],
}] }]
def _create_asset_url(self, path):
return urljoin('https://assets.nintendo.com/', urllib.parse.quote(path))
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) locale, slug = self._match_valid_url(url).group('locale', 'slug')
webpage = self._download_webpage(url, page_id) language, _, country = (locale or 'US').rpartition('-')
parsed_locale = f'{language.lower() or "en"}_{country.upper()}'
self.write_debug(f'Using locale {parsed_locale} (from {locale})', only_once=True)
entries = [ response = self._download_json('https://graph.nintendo.com/', slug, query={
OoyalaIE._build_url_result(m.group('code')) 'operationName': 'NintendoDirect',
for m in re.finditer( 'variables': json.dumps({
r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)] 'locale': parsed_locale,
'slug': slug,
}, separators=(',', ':')),
'extensions': json.dumps({
'persistedQuery': {
'version': 1,
'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb'
},
}, separators=(',', ':')),
})
# API returns `{"data": {"direct": null}}` if no matching id
direct_info = traverse_obj(response, ('data', 'direct', {dict}))
if not direct_info:
raise ExtractorError(f'No Nintendo Direct with id {slug} exists', expected=True)
title = self._html_search_regex( errors = ', '.join(traverse_obj(response, ('errors', ..., 'message')))
r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>', if errors:
webpage, 'title', fatal=False) raise ExtractorError(f'GraphQL API error: {errors or "Unknown error"}')
return self.playlist_result( result = traverse_obj(direct_info, {
entries, page_id, title) 'id': ('id', {str}),
'title': ('name', {str}),
'timestamp': ('startDate', {unified_timestamp}),
'description': ('description', 'text', {str}),
'age_limit': ('contentRating', 'order', {int}),
'tags': ('contentDescriptors', ..., 'label', {str}),
'thumbnail': ('thumbnail', {self._create_asset_url}),
})
result['display_id'] = slug
asset_id = traverse_obj(direct_info, ('video', 'publicId', {str}))
if not asset_id:
youtube_id = traverse_obj(direct_info, ('liveStream', {str}))
if not youtube_id:
self.raise_no_formats('Could not find any video formats', video_id=slug)
return self.url_result(youtube_id, **result, url_transparent=True)
if asset_id.startswith('Legacy Videos/'):
result['_old_archive_ids'] = [make_archive_id(self, asset_id[14:])]
result['formats'] = self._extract_m3u8_formats(
self._create_asset_url(f'/video/upload/sp_full_hd/v1/{asset_id}.m3u8'), slug)
return result

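The rewritten extractor above fetches metadata through Nintendo's GraphQL endpoint using a persisted query: only the operation name, the JSON-encoded variables and a sha256 hash of the (unsent) query text are put in the URL. A minimal standalone sketch of the same request, assuming the requests library and the hash shown in the diff; nothing beyond the data.direct object and its name field is guaranteed here:

import json

import requests  # assumption: any HTTP client works; the extractor itself uses _download_json

# Persisted GraphQL query: no query text is sent, only its sha256 hash
params = {
    'operationName': 'NintendoDirect',
    'variables': json.dumps({'locale': 'en_US', 'slug': '09-04-2019'}, separators=(',', ':')),
    'extensions': json.dumps({
        'persistedQuery': {
            'version': 1,
            'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb',
        },
    }, separators=(',', ':')),
}
response = requests.get('https://graph.nintendo.com/', params=params, timeout=10).json()
direct = (response.get('data') or {}).get('direct')  # None when the slug does not exist
print(direct and direct.get('name'))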

@@ -35,6 +35,7 @@ class PeriscopeBaseIE(InfoExtractor):
             'uploader_id': broadcast.get('user_id') or broadcast.get('username'),
             'thumbnails': thumbnails,
             'view_count': int_or_none(broadcast.get('total_watched')),
+            'concurrent_view_count': int_or_none(broadcast.get('total_watching')),
             'tags': broadcast.get('tags'),
             'live_status': {
                 'running': 'is_live',


@@ -0,0 +1,135 @@
import itertools
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
get_elements_html_by_class,
parse_qs,
traverse_obj,
unified_strdate,
urljoin
)
class TheGuardianPodcastIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
'md5': 'd1771744681789b4cd7da2a08e487702',
'info_dict': {
'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
'ext': 'mp3',
'title': 'We are just getting started: the plastic-eating bacteria that could change the world podcast',
'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
'creator': 'Stephen Buranyi',
'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
'release_date': '20231103'
}
}, {
'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
'md5': 'd1771744681789b4cd7da2a08e487702',
'info_dict': {
'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
'ext': 'mp3',
'title': 'The trials of Robert Habeck: is the worlds most powerful green politician doomed to fail? podcast',
'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
'creator': 'Philip Oltermann',
'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
'release_date': '20231030'
}
}, {
'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
'md5': 'a2fcff6f8e060a95b1483295273dc35e',
'info_dict': {
'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
'ext': 'mp3',
'title': 'Arsenal feel hard done by and Luton hold Liverpool Football Weekly',
'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
'creator': 'Max Rushden',
'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
'release_date': '20231106'
}
}, {
'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
'md5': '06a0f7e9701a80c8064a5d35690481ec',
'info_dict': {
'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
'ext': 'mp3',
'title': 'The Covid inquiry | Politics Weekly UK - podcast',
'description': 'md5:207c98859c14903582b17d25b014046e',
'creator': 'Gaby Hinsliff',
'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
'release_date': '20231102'
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
return {
'id': video_id,
'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
'description': self._og_search_description(webpage),
'creator': self._html_search_meta('author', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
'url': extract_attributes(get_element_html_by_class(
'podcast__player', webpage) or '').get('data-source'),
}
class TheGuardianPodcastPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
_TESTS = [{
'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
'info_dict': {
'id': 'theguardianswomensfootballweekly',
'title': "The Guardian's Women's Football Weekly",
'description': 'md5:e2cc021311e582d29935a73614a43f51'
},
'playlist_mincount': 69
}, {
'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
'info_dict': {
'id': 'todayinfocus',
'title': 'Today in Focus',
'description': 'md5:0f097764fc0d359e0b6eb537be0387e2'
},
'playlist_mincount': 1261
}, {
'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
'info_dict': {
'id': 'the-audio-long-read',
'title': 'The Audio Long Read',
'description': 'md5:5462994a27527309562b25b6defc4ef3'
},
'playlist_mincount': 996
}]
def _entries(self, url, playlist_id):
for page in itertools.count(1):
webpage, urlh = self._download_webpage_handle(
url, playlist_id, f'Downloading page {page}', query={'page': page})
if 'page' not in parse_qs(urlh.url):
break
episodes = get_elements_html_by_class('fc-item--type-media', webpage)
for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'data-id')):
yield url_path
def _real_extract(self, url):
podcast_id = self._match_id(url)
webpage = self._download_webpage(url, podcast_id)
title = clean_html(get_element_by_class(
'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
description = self._og_search_description(webpage) or self._html_search_meta(
'description', webpage)
return self.playlist_from_matches(
self._entries(url, podcast_id), podcast_id, title, description=description,
ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))

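The playlist extractor above walks ?page=N and stops when the site redirects away from a paginated URL, which it detects by checking whether page is still present in the final URL's query string. A rough standalone illustration of that stop condition, assuming the requests library and that the redirect behaviour is unchanged:

import itertools
from urllib.parse import parse_qs, urlparse

import requests  # assumption: plain HTTP client in place of the InfoExtractor helpers

def iter_series_pages(series_url):
    for page in itertools.count(1):
        resp = requests.get(series_url, params={'page': page}, timeout=10)
        # Past the last page, The Guardian redirects to the unpaginated series URL,
        # so 'page' disappears from the final URL and iteration stops
        if 'page' not in parse_qs(urlparse(resp.url).query):
            break
        yield resp.text

for html in iter_series_pages('https://www.theguardian.com/news/series/todayinfocus'):
    pass  # episode links would be parsed out of each page here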

@@ -11,7 +11,6 @@ from ..utils import (
     float_or_none,
     get_element_by_class,
     get_element_by_id,
-    int_or_none,
     parse_duration,
     qualities,
     str_to_int,
@@ -242,35 +241,31 @@ class TwitCastingLiveIE(InfoExtractor):
         'expected_exception': 'UserNotLive',
     }]
 
-    _PROTECTED_LIVE_RE = r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)'
-
     def _real_extract(self, url):
         uploader_id = self._match_id(url)
         self.to_screen(
             'Downloading live video of user {0}. '
             'Pass "https://twitcasting.tv/{0}/show" to download the history'.format(uploader_id))
 
-        webpage = self._download_webpage(url, uploader_id)
-        is_live = self._search_regex(  # first pattern is for public live
-            (r'(data-is-onlive="true")', self._PROTECTED_LIVE_RE), webpage, 'is live?', default=None)
-        current_live = int_or_none(self._search_regex(
-            (r'data-type="movie" data-id="(\d+)">',  # not available?
-             r'tw-sound-flag-open-link" data-id="(\d+)" style=',  # not available?
-             r'data-movie-id="(\d+)"'),  # if not currently live, value may be 0
-            webpage, 'current live ID', default=None))
-        if is_live and not current_live:
-            # fetch unfiltered /show to find running livestreams; we can't get ID of the password-protected livestream above
-            webpage = self._download_webpage(
-                f'https://twitcasting.tv/{uploader_id}/show/', uploader_id,
-                note='Downloading live history')
-            is_live = self._search_regex(self._PROTECTED_LIVE_RE, webpage, 'is live?', default=None)
-            if is_live:
-                # get the first live; running live is always at the first
-                current_live = self._search_regex(
-                    r'(?s)<a\s+class="tw-movie-thumbnail2"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>',
-                    webpage, 'current live ID 2', default=None, group='video_id')
-        if not current_live:
+        is_live = traverse_obj(self._download_json(
+            f'https://frontendapi.twitcasting.tv/watch/user/{uploader_id}',
+            uploader_id, 'Checking live status', data=b'', fatal=False), ('is_live', {bool}))
+        if is_live is False:  # only raise here if API response was as expected
             raise UserNotLive(video_id=uploader_id)
+
+        # Use /show/ page so that password-protected and members-only livestreams can be found
+        webpage = self._download_webpage(
+            f'https://twitcasting.tv/{uploader_id}/show/', uploader_id, 'Downloading live history')
+        is_live = is_live or self._search_regex(
+            r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)',
+            webpage, 'is live?', default=False)
+
+        # Current live is always the first match
+        current_live = self._search_regex(
+            r'(?s)<a\s+class="tw-movie-thumbnail2"\s+href="/[^/"]+/movie/(?P<video_id>\d+)"',
+            webpage, 'current live ID', default=None, group='video_id')
+        if not is_live or not current_live:
+            raise UserNotLive(video_id=uploader_id)
+
         return self.url_result(f'https://twitcasting.tv/{uploader_id}/movie/{current_live}', TwitCastingIE)

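The new logic above first asks TwitCasting's frontend API whether the user is live (the empty body mirrors data=b'' and makes the request a POST), and only then falls back to scraping the /show/ page. A small sketch of that first step, assuming the requests library and that the response keeps the boolean is_live field used in the diff:

import requests  # assumption: standalone client; the extractor uses self._download_json

def check_live(uploader_id):
    # Empty body selects the POST endpoint, matching the extractor's data=b''
    resp = requests.post(
        f'https://frontendapi.twitcasting.tv/watch/user/{uploader_id}',
        data=b'', timeout=10)
    if not resp.ok:
        return None  # API inconclusive; caller should fall back to scraping /show/
    is_live = resp.json().get('is_live')
    return is_live if isinstance(is_live, bool) else None

print(check_live('example_user'))  # hypothetical user id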
yt_dlp/extractor/vidly.py (new file, 83 lines)

@@ -0,0 +1,83 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
mimetype2ext,
url_or_none,
)
from ..utils.traversal import traverse_obj
class VidlyIE(InfoExtractor):
_VALID_URL = r'https?://(?:vid\.ly/|(?:s\.)?vid\.ly/embeded\.html\?(?:[^#]+&)?link=)(?P<id>\w+)'
_EMBED_REGEX = [r'<script[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//vid\.ly/\w+/embed[^\'"]+)',
r'<iframe[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//(?:s\.)?vid\.ly/embeded\.html\?(?:[^#\'"]+&)?link=\w+[^\'"]+)']
_TESTS = [{
# JWPlayer 7, Embeds forbidden
'url': 'https://vid.ly/2i3o9j/embed',
'info_dict': {
'id': '2i3o9j',
'ext': 'mp4',
'title': '2i3o9j',
'thumbnail': r're:https://\w+\.cloudfront\.net/',
},
}, {
# JWPlayer 6
'url': 'http://s.vid.ly/embeded.html?link=jw_test&new=1&autoplay=true&controls=true',
'info_dict': {
'id': 'jw_test',
'ext': 'mp4',
'title': '2x8m8t',
'thumbnail': r're:https://\w+\.cloudfront\.net/',
},
}, {
# Vidlyplayer
'url': 'https://vid.ly/7x0e6l',
'info_dict': {
'id': '7x0e6l',
'ext': 'mp4',
'title': '7x0e6l',
},
}]
_WEBPAGE_TESTS = [{
'url': 'https://www.petfinder.com/dog/gus-57378930/tn/ooltewah/furever-furkids-rescue-tn592/',
'info_dict': {
'id': 'w8p5b0',
'ext': 'mp4',
'title': 'w8p5b0',
'thumbnail': r're:https://\w+\.cloudfront\.net/',
}
}]
def _real_extract(self, url):
video_id = self._match_id(url)
embed_script = self._download_webpage(
f'https://vid.ly/{video_id}/embed', video_id, headers={'Referer': 'https://vid.ly/'})
player = self._search_json(r'initCallback\(', embed_script, 'player', video_id)
player_type = player.get('player') or ''
if player_type.startswith('jwplayer'):
return self._parse_jwplayer_data(player['config'], video_id)
elif not player_type.startswith('vidly'):
raise ExtractorError(f'Unknown player type {player_type!r}')
formats = []
ext = mimetype2ext(traverse_obj(player, ('config', 'type')))
for source, fid in [('source', 'sd'), ('source_hd', 'hd')]:
if traverse_obj(player, ('config', source, {url_or_none})):
formats.append({
'url': player['config'][source],
'format_id': f'http-{fid}',
'ext': ext,
})
# Has higher quality formats
formats.extend(self._extract_m3u8_formats(
f'https://d3fenhwk93s16g.cloudfront.net/{video_id}/hls.m3u8', video_id,
fatal=False, note='Requesting higher quality m3u8 formats',
errnote='No higher quality m3u8 formats found') or [])
return {
'id': video_id,
'title': video_id,
'formats': formats,
}


@@ -57,7 +57,7 @@ class VocarooIE(InfoExtractor):
             'title': '',
             'url': url,
             'ext': 'mp3',
-            'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000),
+            'timestamp': float_or_none(resp.headers.get('x-bz-upload-timestamp'), scale=1000),
             'vcodec': 'none',
             'http_headers': http_headers,
         }


@@ -1,3 +1,4 @@
+import functools
 import re
 
 from .common import InfoExtractor
@@ -14,21 +15,21 @@ class VVVVIDIE(InfoExtractor):
     _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE
     _TESTS = [{
         # video_type == 'video/vvvvid'
-        'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong',
-        'md5': 'b8d3cecc2e981adc3835adf07f6df91b',
+        'url': 'https://www.vvvvid.it/show/498/the-power-of-computing/518/505692/playstation-vr-cambiera-il-nostro-modo-di-giocare',
         'info_dict': {
-            'id': '489048',
+            'id': '505692',
             'ext': 'mp4',
-            'title': 'Ping Pong',
-            'duration': 239,
-            'series': '"Perché dovrei guardarlo?" di Dario Moccia',
-            'season_id': '437',
-            'episode': 'Ping Pong',
-            'episode_number': 1,
-            'episode_id': '3334',
+            'title': 'Playstation VR cambierà il nostro modo di giocare',
+            'duration': 93,
+            'series': 'The Power of Computing',
+            'season_id': '518',
+            'episode': 'Playstation VR cambierà il nostro modo di giocare',
+            'episode_number': None,
+            'episode_id': '4747',
             'view_count': int,
             'like_count': int,
             'repost_count': int,
+            'thumbnail': 'https://static.vvvvid.it/img/zoomin/28CA2409-E663-34F0-2B02E72356556EA3_500k.jpg',
         },
         'params': {
             'skip_download': True,
@@ -36,7 +37,6 @@ class VVVVIDIE(InfoExtractor):
     }, {
         # video_type == 'video/rcs'
         'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01',
-        'md5': '33e0edfba720ad73a8782157fdebc648',
         'info_dict': {
             'id': '482493',
             'ext': 'mp4',
@@ -45,6 +45,7 @@ class VVVVIDIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'Every video/rcs is not working even in real website',
     }, {
         # video_type == 'video/youtube'
         'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer',
@@ -55,19 +56,54 @@ class VVVVIDIE(InfoExtractor):
             'title': 'Trailer',
             'upload_date': '20150906',
             'description': 'md5:a5e802558d35247fee285875328c0b80',
-            'uploader_id': 'BandaiVisual',
-            'uploader': 'BANDAI NAMCO Arts Channel',
+            'uploader_id': '@EMOTIONLabelChannel',
+            'uploader': 'EMOTION Label Channel',
+            'episode_number': None,
+            'episode_id': '3115',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'availability': str,
+            'categories': list,
+            'age_limit': 0,
+            'channel': 'EMOTION Label Channel',
+            'channel_follower_count': int,
+            'channel_id': 'UCQ5URCSs1f5Cz9rh-cDGxNQ',
+            'channel_url': 'https://www.youtube.com/channel/UCQ5URCSs1f5Cz9rh-cDGxNQ',
+            'comment_count': int,
+            'duration': 133,
+            'episode': 'Trailer',
+            'heatmap': list,
+            'live_status': 'not_live',
+            'playable_in_embed': True,
+            'season_id': '406',
+            'series': 'One-Punch Man',
+            'tags': list,
+            'uploader_url': 'https://www.youtube.com/@EMOTIONLabelChannel',
+            'thumbnail': 'https://i.ytimg.com/vi/RzmFKUDOUgw/maxresdefault.jpg',
         },
         'params': {
             'skip_download': True,
         },
     }, {
         # video_type == 'video/dash'
-        'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi',
+        'url': 'https://www.vvvvid.it/show/844/le-bizzarre-avventure-di-jojo-vento-aureo/938/527551/golden-wind',
         'info_dict': {
-            'id': '693786',
+            'id': '527551',
             'ext': 'mp4',
-            'title': 'Nanachi',
+            'title': 'Golden Wind',
+            'duration': 1430,
+            'series': 'Le bizzarre avventure di Jojo - Vento Aureo',
+            'season_id': '938',
+            'episode': 'Golden Wind',
+            'episode_number': 1,
+            'episode_id': '9089',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'thumbnail': 'https://static.vvvvid.it/img/thumbs/Dynit/Jojo/Jojo_S05Ep01-t.jpg',
+            'season': 'Season 5',
+            'season_number': 5,
         },
         'params': {
             'skip_download': True,
@@ -79,10 +115,17 @@
     }]
     _conn_id = None
 
+    @functools.cached_property
+    def _headers(self):
+        return {
+            **self.geo_verification_headers(),
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.50 Safari/537.37',
+        }
+
     def _real_initialize(self):
         self._conn_id = self._download_json(
             'https://www.vvvvid.it/user/login',
-            None, headers=self.geo_verification_headers())['data']['conn_id']
+            None, headers=self._headers)['data']['conn_id']
 
     def _download_info(self, show_id, path, video_id, fatal=True, query=None):
         q = {
@@ -92,7 +135,7 @@ class VVVVIDIE(InfoExtractor):
             q.update(query)
         response = self._download_json(
             'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path),
-            video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal)
+            video_id, headers=self._headers, query=q, fatal=fatal)
         if not (response or fatal):
             return
         if response.get('result') == 'error':
@@ -219,7 +262,7 @@ class VVVVIDIE(InfoExtractor):
                 embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False))
             else:
                 formats.extend(self._extract_wowza_formats(
-                    'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
+                    'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id, skip_protocols=['f4m']))
             metadata_from_url(embed_code)
 
             if not is_youtube:


@@ -21,3 +21,11 @@ except ImportError:
    pass
except Exception as e:
    warnings.warn(f'Failed to import "requests" request handler: {e}' + bug_reports_message())
try:
from . import _websockets
except ImportError:
pass
except Exception as e:
warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message())


@@ -0,0 +1,159 @@
from __future__ import annotations
import io
import logging
import ssl
import sys
from ._helper import create_connection, select_proxy, make_socks_proxy_opts, create_socks_proxy_socket
from .common import Response, register_rh, Features
from .exceptions import (
CertificateVerifyError,
HTTPError,
RequestError,
SSLError,
TransportError, ProxyError,
)
from .websocket import WebSocketRequestHandler, WebSocketResponse
from ..compat import functools
from ..dependencies import websockets
from ..utils import int_or_none
from ..socks import ProxyError as SocksProxyError
if not websockets:
raise ImportError('websockets is not installed')
import websockets.version
websockets_version = tuple(map(int_or_none, websockets.version.version.split('.')))
if websockets_version < (12, 0):
raise ImportError('Only websockets>=12.0 is supported')
import websockets.sync.client
from websockets.uri import parse_uri
class WebsocketsResponseAdapter(WebSocketResponse):
def __init__(self, wsw: websockets.sync.client.ClientConnection, url):
super().__init__(
fp=io.BytesIO(wsw.response.body or b''),
url=url,
headers=wsw.response.headers,
status=wsw.response.status_code,
reason=wsw.response.reason_phrase,
)
self.wsw = wsw
def close(self):
self.wsw.close()
super().close()
def send(self, message):
# https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.send
try:
return self.wsw.send(message)
except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e:
raise TransportError(cause=e) from e
except SocksProxyError as e:
raise ProxyError(cause=e) from e
except TypeError as e:
raise RequestError(cause=e) from e
def recv(self):
# https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv
try:
return self.wsw.recv()
except SocksProxyError as e:
raise ProxyError(cause=e) from e
except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e:
raise TransportError(cause=e) from e
@register_rh
class WebsocketsRH(WebSocketRequestHandler):
"""
Websockets request handler
https://websockets.readthedocs.io
https://github.com/python-websockets/websockets
"""
_SUPPORTED_URL_SCHEMES = ('wss', 'ws')
_SUPPORTED_PROXY_SCHEMES = ('socks4', 'socks4a', 'socks5', 'socks5h')
_SUPPORTED_FEATURES = (Features.ALL_PROXY, Features.NO_PROXY)
RH_NAME = 'websockets'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
for name in ('websockets.client', 'websockets.server'):
logger = logging.getLogger(name)
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(logging.Formatter(f'{self.RH_NAME}: %(message)s'))
logger.addHandler(handler)
if self.verbose:
logger.setLevel(logging.DEBUG)
def _check_extensions(self, extensions):
super()._check_extensions(extensions)
extensions.pop('timeout', None)
extensions.pop('cookiejar', None)
def _send(self, request):
timeout = float(request.extensions.get('timeout') or self.timeout)
headers = self._merge_headers(request.headers)
if 'cookie' not in headers:
cookiejar = request.extensions.get('cookiejar') or self.cookiejar
cookie_header = cookiejar.get_cookie_header(request.url)
if cookie_header:
headers['cookie'] = cookie_header
wsuri = parse_uri(request.url)
create_conn_kwargs = {
'source_address': (self.source_address, 0) if self.source_address else None,
'timeout': timeout
}
proxy = select_proxy(request.url, request.proxies or self.proxies or {})
try:
if proxy:
socks_proxy_options = make_socks_proxy_opts(proxy)
sock = create_connection(
address=(socks_proxy_options['addr'], socks_proxy_options['port']),
_create_socket_func=functools.partial(
create_socks_proxy_socket, (wsuri.host, wsuri.port), socks_proxy_options),
**create_conn_kwargs
)
else:
sock = create_connection(
address=(wsuri.host, wsuri.port),
**create_conn_kwargs
)
conn = websockets.sync.client.connect(
sock=sock,
uri=request.url,
additional_headers=headers,
open_timeout=timeout,
user_agent_header=None,
ssl_context=self._make_sslcontext() if wsuri.secure else None,
close_timeout=0, # not ideal, but prevents yt-dlp hanging
)
return WebsocketsResponseAdapter(conn, url=request.url)
# Exceptions as per https://websockets.readthedocs.io/en/stable/reference/sync/client.html
except SocksProxyError as e:
raise ProxyError(cause=e) from e
except websockets.exceptions.InvalidURI as e:
raise RequestError(cause=e) from e
except ssl.SSLCertVerificationError as e:
raise CertificateVerifyError(cause=e) from e
except ssl.SSLError as e:
raise SSLError(cause=e) from e
except websockets.exceptions.InvalidStatus as e:
raise HTTPError(
Response(
fp=io.BytesIO(e.response.body),
url=request.url,
headers=e.response.headers,
status=e.response.status_code,
reason=e.response.reason_phrase),
) from e
except (OSError, TimeoutError, websockets.exceptions.WebSocketException) as e:
raise TransportError(cause=e) from e

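Once this handler is registered, WebSocket URLs go through the same networking stack as HTTP requests. A small usage sketch, assuming a YoutubeDL instance whose urlopen routes ws:// and wss:// requests to WebsocketsRH and returns the WebsocketsResponseAdapter defined above; the echo server URL is only an example endpoint:

from yt_dlp import YoutubeDL
from yt_dlp.networking import Request

with YoutubeDL() as ydl:
    # The URL scheme selects the handler: ws/wss is served by WebsocketsRH
    ws = ydl.urlopen(Request('wss://echo.websocket.events', headers={'Origin': 'https://example.com'}))
    ws.send('ping')   # str is sent as a text frame, bytes as a binary frame
    print(ws.recv())  # blocks until the server replies
    ws.close()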

@@ -0,0 +1,23 @@
from __future__ import annotations
import abc
from .common import Response, RequestHandler
class WebSocketResponse(Response):
def send(self, message: bytes | str):
"""
Send a message to the server.
@param message: The message to send. A string (str) is sent as a text frame, bytes is sent as a binary frame.
"""
raise NotImplementedError
def recv(self):
raise NotImplementedError
class WebSocketRequestHandler(RequestHandler, abc.ABC):
pass


@@ -1,4 +1,6 @@
"""No longer used and new code should not use. Exists only for API compat."""
import asyncio
import atexit
import platform
import struct
import sys
@@ -32,6 +34,77 @@ has_certifi = bool(certifi)
has_websockets = bool(websockets)
class WebSocketsWrapper:
"""Wraps websockets module to use in non-async scopes"""
pool = None
def __init__(self, url, headers=None, connect=True, **ws_kwargs):
self.loop = asyncio.new_event_loop()
# XXX: "loop" is deprecated
self.conn = websockets.connect(
url, extra_headers=headers, ping_interval=None,
close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'), **ws_kwargs)
if connect:
self.__enter__()
atexit.register(self.__exit__, None, None, None)
def __enter__(self):
if not self.pool:
self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
return self
def send(self, *args):
self.run_with_loop(self.pool.send(*args), self.loop)
def recv(self, *args):
return self.run_with_loop(self.pool.recv(*args), self.loop)
def __exit__(self, type, value, traceback):
try:
return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
finally:
self.loop.close()
self._cancel_all_tasks(self.loop)
# taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
# for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
@staticmethod
def run_with_loop(main, loop):
if not asyncio.iscoroutine(main):
raise ValueError(f'a coroutine was expected, got {main!r}')
try:
return loop.run_until_complete(main)
finally:
loop.run_until_complete(loop.shutdown_asyncgens())
if hasattr(loop, 'shutdown_default_executor'):
loop.run_until_complete(loop.shutdown_default_executor())
@staticmethod
def _cancel_all_tasks(loop):
to_cancel = asyncio.all_tasks(loop)
if not to_cancel:
return
for task in to_cancel:
task.cancel()
# XXX: "loop" is removed in python 3.10+
loop.run_until_complete(
asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
for task in to_cancel:
if task.cancelled():
continue
if task.exception() is not None:
loop.call_exception_handler({
'message': 'unhandled exception during asyncio.run() shutdown',
'exception': task.exception(),
'task': task,
})
def load_plugins(name, suffix, namespace): def load_plugins(name, suffix, namespace):
from ..plugins import load_plugins from ..plugins import load_plugins
ret = load_plugins(name, suffix) ret = load_plugins(name, suffix)


@@ -1,5 +1,3 @@
-import asyncio
-import atexit
 import base64
 import binascii
 import calendar
@@ -54,7 +52,7 @@ from ..compat import (
     compat_os_name,
     compat_shlex_quote,
 )
-from ..dependencies import websockets, xattr
+from ..dependencies import xattr
 
 __name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module
@@ -4923,77 +4921,6 @@
         return self.parser.parse_args(self.all_args)
class WebSocketsWrapper:
"""Wraps websockets module to use in non-async scopes"""
pool = None
def __init__(self, url, headers=None, connect=True):
self.loop = asyncio.new_event_loop()
# XXX: "loop" is deprecated
self.conn = websockets.connect(
url, extra_headers=headers, ping_interval=None,
close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
if connect:
self.__enter__()
atexit.register(self.__exit__, None, None, None)
def __enter__(self):
if not self.pool:
self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
return self
def send(self, *args):
self.run_with_loop(self.pool.send(*args), self.loop)
def recv(self, *args):
return self.run_with_loop(self.pool.recv(*args), self.loop)
def __exit__(self, type, value, traceback):
try:
return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
finally:
self.loop.close()
self._cancel_all_tasks(self.loop)
# taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
# for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
@staticmethod
def run_with_loop(main, loop):
if not asyncio.iscoroutine(main):
raise ValueError(f'a coroutine was expected, got {main!r}')
try:
return loop.run_until_complete(main)
finally:
loop.run_until_complete(loop.shutdown_asyncgens())
if hasattr(loop, 'shutdown_default_executor'):
loop.run_until_complete(loop.shutdown_default_executor())
@staticmethod
def _cancel_all_tasks(loop):
to_cancel = asyncio.all_tasks(loop)
if not to_cancel:
return
for task in to_cancel:
task.cancel()
# XXX: "loop" is removed in python 3.10+
loop.run_until_complete(
asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
for task in to_cancel:
if task.cancelled():
continue
if task.exception() is not None:
loop.call_exception_handler({
'message': 'unhandled exception during asyncio.run() shutdown',
'exception': task.exception(),
'task': task,
})
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}