Compare commits

..

8 Commits

Author SHA1 Message Date
coletdjnz
4e643276c0
tidy up requirements 2023-10-09 20:41:24 +13:00
coletdjnz
cf87f4a40b
Fix compat options 2022 reference on README 2023-10-09 20:40:58 +13:00
coletdjnz
96427fc45b
Merge remote-tracking branch 'upstream/master' into net-backend/requests 2023-10-09 20:17:30 +13:00
coletdjnz
07d604a1cb
quotes 2023-10-09 20:12:53 +13:00
coletdjnz
ab23f57a88
Update yt_dlp/networking/_requests.py
Co-authored-by: Simon Sawicki <accounts@grub4k.xyz>
2023-10-09 07:00:10 +00:00
coletdjnz
324dc382cc
check incomplete read match for error response tests 2023-10-09 19:39:33 +13:00
Simon Sawicki
1c51c520f7
[fd/fragment] Improve progress calculation (#8241)
This uses the download speed from all threads and also adds smoothing to speed and eta

Authored by: Grub4K
2023-10-08 02:01:01 +02:00
Awal Garg
9d7ded6419
[utils] js_to_json: Fix Date constructor parsing (#8295)
Authored by: awalgarg, Grub4K
2023-10-08 01:57:23 +02:00
8 changed files with 159 additions and 53 deletions

View File

@ -166,7 +166,7 @@ For ease of use, a few more compat options are available:
* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter`
* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter`
* `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date`
* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress`. Use this to enable all future compat options
* `--compat-options 2022`: Same as `--compat-options playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler`. Use this to enable all future compat options
# INSTALLATION

View File

@ -4,5 +4,5 @@ websockets
brotli; platform_python_implementation=='CPython'
brotlicffi; platform_python_implementation!='CPython'
certifi
requests>=2.31.0,<3.0
requests>=2.31.0,<3
urllib3>=1.26.17,<3

View File

@ -852,20 +852,26 @@ class TestRequestsRequestHandler(TestRequestHandlerBase):
assert exc_info.type is expected
@pytest.mark.parametrize('raised,expected', [
(lambda: urllib3.exceptions.SSLError(), SSLError),
(lambda: urllib3.exceptions.TimeoutError(), TransportError),
(lambda: urllib3.exceptions.ReadTimeoutError(None, None, None), TransportError),
(lambda: urllib3.exceptions.ProtocolError(), TransportError),
(lambda: urllib3.exceptions.ProtocolError(
'error', http.client.IncompleteRead(partial=b'')), IncompleteRead),
(lambda: urllib3.exceptions.IncompleteRead(partial=3, expected=5), IncompleteRead),
(lambda: urllib3.exceptions.DecodeError(), TransportError),
(lambda: urllib3.exceptions.HTTPError(), TransportError) # catch-all
@pytest.mark.parametrize('raised,expected,match', [
(lambda: urllib3.exceptions.SSLError(), SSLError, None),
(lambda: urllib3.exceptions.TimeoutError(), TransportError, None),
(lambda: urllib3.exceptions.ReadTimeoutError(None, None, None), TransportError, None),
(lambda: urllib3.exceptions.ProtocolError(), TransportError, None),
(lambda: urllib3.exceptions.DecodeError(), TransportError, None),
(lambda: urllib3.exceptions.HTTPError(), TransportError, None), # catch-all
(
lambda: urllib3.exceptions.ProtocolError('error', http.client.IncompleteRead(partial=b'abc', expected=4)),
IncompleteRead,
'3 bytes read, 4 more expected'
),
(
lambda: urllib3.exceptions.IncompleteRead(partial=3, expected=5),
IncompleteRead,
'3 bytes read, 5 more expected'
),
])
@pytest.mark.skipif(requests is None, reason='requests is not installed')
@pytest.mark.skipif(urllib3 is None, reason='urllib3 is not installed')
def test_response_error_mapping(self, monkeypatch, raised, expected):
@pytest.mark.parametrize('handler', ['Requests'], indirect=True)
def test_response_error_mapping(self, handler, monkeypatch, raised, expected, match):
from urllib3.response import HTTPResponse as Urllib3Response
from requests.models import Response as RequestsResponse
from yt_dlp.networking._requests import RequestsResponseAdapter
@ -875,9 +881,9 @@ class TestRequestsRequestHandler(TestRequestHandlerBase):
def mock_read(*args, **kwargs):
raise raised()
monkeypatch.setattr(res.fp, "read", mock_read)
monkeypatch.setattr(res.fp, 'read', mock_read)
with pytest.raises(expected) as exc_info:
with pytest.raises(expected, match=match) as exc_info:
res.read()
assert exc_info.type is expected

View File

@ -1209,6 +1209,9 @@ class TestUtil(unittest.TestCase):
on = js_to_json('\'"\\""\'')
self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
on = js_to_json('[new Date("spam"), \'("eggs")\']')
self.assertEqual(json.loads(on), ['spam', '("eggs")'], msg='Date regex should match a single string')
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
@ -1220,11 +1223,13 @@ class TestUtil(unittest.TestCase):
self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
self.assertEqual(js_to_json('`${name}`', {}), '"name"')
def test_js_to_json_map_array_constructors(self):
def test_js_to_json_common_constructors(self):
self.assertEqual(json.loads(js_to_json('new Map([["a", 5]])')), {'a': 5})
self.assertEqual(json.loads(js_to_json('Array(5, 10)')), [5, 10])
self.assertEqual(json.loads(js_to_json('new Array(15,5)')), [15, 5])
self.assertEqual(json.loads(js_to_json('new Map([Array(5, 10),new Array(15,5)])')), {'5': 10, '15': 5})
self.assertEqual(json.loads(js_to_json('new Date("123")')), "123")
self.assertEqual(json.loads(js_to_json('new Date(\'2023-10-19\')')), "2023-10-19")
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})

View File

@ -14,6 +14,7 @@ from ..networking import Request
from ..networking.exceptions import HTTPError, IncompleteRead
from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj
from ..utils.networking import HTTPHeaderDict
from ..utils.progress import ProgressCalculator
class HttpQuietDownloader(HttpFD):
@ -226,8 +227,7 @@ class FragmentFD(FileDownloader):
resume_len = ctx['complete_frags_downloaded_bytes']
total_frags = ctx['total_frags']
ctx_id = ctx.get('ctx_id')
# This dict stores the download progress, it's updated by the progress
# hook
# Stores the download progress, updated by the progress hook
state = {
'status': 'downloading',
'downloaded_bytes': resume_len,
@ -237,14 +237,8 @@ class FragmentFD(FileDownloader):
'tmpfilename': ctx['tmpfilename'],
}
start = time.time()
ctx.update({
'started': start,
'fragment_started': start,
# Amount of fragment's bytes downloaded by the time of the previous
# frag progress hook invocation
'prev_frag_downloaded_bytes': 0,
})
ctx['started'] = time.time()
progress = ProgressCalculator(resume_len)
def frag_progress_hook(s):
if s['status'] not in ('downloading', 'finished'):
@ -259,38 +253,35 @@ class FragmentFD(FileDownloader):
state['max_progress'] = ctx.get('max_progress')
state['progress_idx'] = ctx.get('progress_idx')
time_now = time.time()
state['elapsed'] = time_now - start
state['elapsed'] = progress.elapsed
frag_total_bytes = s.get('total_bytes') or 0
s['fragment_info_dict'] = s.pop('info_dict', {})
# XXX: Fragment resume is not accounted for here
if not ctx['live']:
estimated_size = (
(ctx['complete_frags_downloaded_bytes'] + frag_total_bytes)
/ (state['fragment_index'] + 1) * total_frags)
state['total_bytes_estimate'] = estimated_size
progress.total = estimated_size
progress.update(s.get('downloaded_bytes'))
state['total_bytes_estimate'] = progress.total
else:
progress.update(s.get('downloaded_bytes'))
if s['status'] == 'finished':
state['fragment_index'] += 1
ctx['fragment_index'] = state['fragment_index']
state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
ctx['speed'] = state['speed'] = self.calc_speed(
ctx['fragment_started'], time_now, frag_total_bytes)
ctx['fragment_started'] = time.time()
ctx['prev_frag_downloaded_bytes'] = 0
else:
frag_downloaded_bytes = s['downloaded_bytes']
state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes']
ctx['speed'] = state['speed'] = self.calc_speed(
ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0))
if not ctx['live']:
state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes'])
ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
progress.thread_reset()
state['downloaded_bytes'] = ctx['complete_frags_downloaded_bytes'] = progress.downloaded
state['speed'] = ctx['speed'] = progress.speed.smooth
state['eta'] = progress.eta.smooth
self._hook_progress(state, info_dict)
ctx['dl'].add_progress_hook(frag_progress_hook)
return start
return ctx['started']
def _finish_frag_download(self, ctx, info_dict):
ctx['dest_stream'].close()
@ -500,7 +491,6 @@ class FragmentFD(FileDownloader):
download_fragment(fragment, ctx_copy)
return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized')
self.report_warning('The download speed shown is only of one thread. This is a known issue')
with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
try:
for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments):

View File

@ -14,11 +14,7 @@ if requests is None:
if urllib3 is None:
raise ImportError('urllib3 module is not installed')
urllib3_version = urllib3.__version__.split('.')
if len(urllib3_version) == 2:
urllib3_version.append('0')
urllib3_version = tuple(map(functools.partial(int_or_none, default=0), urllib3_version[:3]))
urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))
if urllib3_version < (1, 26, 17):
raise ImportError('Only urllib3 >= 1.26.17 is supported')

View File

@ -2744,7 +2744,7 @@ def js_to_json(code, vars={}, *, strict=False):
code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
if not strict:
code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

109
yt_dlp/utils/progress.py Normal file
View File

@ -0,0 +1,109 @@
from __future__ import annotations
import bisect
import threading
import time
class ProgressCalculator:
# Time to calculate the speed over (seconds)
SAMPLING_WINDOW = 3
# Minimum timeframe before to sample next downloaded bytes (seconds)
SAMPLING_RATE = 0.05
# Time before showing eta (seconds)
GRACE_PERIOD = 1
def __init__(self, initial: int):
self._initial = initial or 0
self.downloaded = self._initial
self.elapsed: float = 0
self.speed = SmoothValue(0, smoothing=0.7)
self.eta = SmoothValue(None, smoothing=0.9)
self._total = 0
self._start_time = time.monotonic()
self._last_update = self._start_time
self._lock = threading.Lock()
self._thread_sizes: dict[int, int] = {}
self._times = [self._start_time]
self._downloaded = [self.downloaded]
@property
def total(self):
return self._total
@total.setter
def total(self, value: int | None):
with self._lock:
if value is not None and value < self.downloaded:
value = self.downloaded
self._total = value
def thread_reset(self):
current_thread = threading.get_ident()
with self._lock:
self._thread_sizes[current_thread] = 0
def update(self, size: int | None):
if not size:
return
current_thread = threading.get_ident()
with self._lock:
last_size = self._thread_sizes.get(current_thread, 0)
self._thread_sizes[current_thread] = size
self._update(size - last_size)
def _update(self, size: int):
current_time = time.monotonic()
self.downloaded += size
self.elapsed = current_time - self._start_time
if self.total is not None and self.downloaded > self.total:
self._total = self.downloaded
if self._last_update + self.SAMPLING_RATE > current_time:
return
self._last_update = current_time
self._times.append(current_time)
self._downloaded.append(self.downloaded)
offset = bisect.bisect_left(self._times, current_time - self.SAMPLING_WINDOW)
del self._times[:offset]
del self._downloaded[:offset]
if len(self._times) < 2:
self.speed.reset()
self.eta.reset()
return
download_time = current_time - self._times[0]
if not download_time:
return
self.speed.set((self.downloaded - self._downloaded[0]) / download_time)
if self.total and self.speed.value and self.elapsed > self.GRACE_PERIOD:
self.eta.set((self.total - self.downloaded) / self.speed.value)
else:
self.eta.reset()
class SmoothValue:
def __init__(self, initial: float | None, smoothing: float):
self.value = self.smooth = self._initial = initial
self._smoothing = smoothing
def set(self, value: float):
self.value = value
if self.smooth is None:
self.smooth = self.value
else:
self.smooth = (1 - self._smoothing) * value + self._smoothing * self.smooth
def reset(self):
self.value = self.smooth = self._initial