Compare commits

...

11 Commits

Author SHA1 Message Date
Kieran
37c0e9da33
Merge 19fa262ecb into c699bafc50 2024-11-16 02:21:27 +00:00
bashonly
19fa262ecb
ruff fixes
Authored by: bashonly
2024-11-15 20:21:08 -06:00
bashonly
709fa17030
Merge branch 'yt-dlp:master' into pr/9894 2024-11-15 20:20:31 -06:00
bashonly
c699bafc50 [ie/soop] Fix thumbnail extraction (#11545)
Closes #11537

Authored by: bashonly
2024-11-15 22:51:55 +00:00
bashonly
eb64ae7d5d [ie] Allow ext override for thumbnails (#11545)
Authored by: bashonly
2024-11-15 22:51:55 +00:00
Simon Sawicki
c014fbcddc
[utils] subs_list_to_dict: Add lang default parameter (#11508)
Authored by: Grub4K
2024-11-15 23:25:52 +01:00
Simon Sawicki
39d79c9b9c
[utils] Fix join_nonempty, add **kwargs to unpack (#11559)
Authored by: Grub4K
2024-11-15 22:06:15 +01:00
Kieran Eglin
311dc3d438
Addl. regex refactoring 2024-05-09 10:57:13 -07:00
Kieran Eglin
620d721779
Refactored based on feedback 2024-05-09 10:55:22 -07:00
Kieran Eglin
6d23661542
Updated playlist regex 2024-05-09 10:37:51 -07:00
Kieran Eglin
d816fb28dc
Added 30 day singer extractor 2024-05-09 09:39:23 -07:00
9 changed files with 187 additions and 24 deletions

View File

@ -481,7 +481,7 @@ class TestTraversalHelpers:
'id': 'name', 'id': 'name',
'data': 'content', 'data': 'content',
'url': 'url', 'url': 'url',
}, all, {subs_list_to_dict}]) == { }, all, {subs_list_to_dict(lang=None)}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}], 'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [{'data': 'content'}], 'en': [{'data': 'content'}],
}, 'subs with mandatory items missing should be filtered' }, 'subs with mandatory items missing should be filtered'
@ -507,6 +507,54 @@ class TestTraversalHelpers:
{'url': 'https://example.com/subs/en1', 'ext': 'ext'}, {'url': 'https://example.com/subs/en1', 'ext': 'ext'},
{'url': 'https://example.com/subs/en2', 'ext': 'ext'}, {'url': 'https://example.com/subs/en2', 'ext': 'ext'},
]}, '`quality` key should sort subtitle list accordingly' ]}, '`quality` key should sort subtitle list accordingly'
assert traverse_obj([
{'name': 'de', 'url': 'https://example.com/subs/de.ass'},
{'name': 'de'},
{'name': 'en', 'content': 'content'},
{'url': 'https://example.com/subs/en'},
], [..., {
'id': 'name',
'url': 'url',
'data': 'content',
}, all, {subs_list_to_dict(lang='en')}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [
{'data': 'content'},
{'url': 'https://example.com/subs/en'},
],
}, 'optionally provided lang should be used if no id available'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be ignored for id and ext'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang='de')}]) == {
'de': [
{'url': 'https://example.com/subs/de1'},
{'url': 'https://example.com/subs/de2'},
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be replaced by default id'
def test_trim_str(self): def test_trim_str(self):
with pytest.raises(TypeError): with pytest.raises(TypeError):
@ -525,7 +573,7 @@ class TestTraversalHelpers:
def test_unpack(self): def test_unpack(self):
assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123' assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123'
assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3' assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3'
assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3' assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3'
with pytest.raises(TypeError): with pytest.raises(TypeError):
unpack(join_nonempty)() unpack(join_nonempty)()
with pytest.raises(TypeError): with pytest.raises(TypeError):

View File

@ -72,7 +72,6 @@ from yt_dlp.utils import (
intlist_to_bytes, intlist_to_bytes,
iri_to_uri, iri_to_uri,
is_html, is_html,
join_nonempty,
js_to_json, js_to_json,
limit_length, limit_length,
locked_file, locked_file,
@ -2158,10 +2157,6 @@ Line 1
assert int_or_none(v=10) == 10, 'keyword passed positional should call function' assert int_or_none(v=10) == 10, 'keyword passed positional should call function'
assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function' assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function'
assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially'
assert callable(join_nonempty()), 'varargs positional should apply partially'
assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function'
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -4381,7 +4381,9 @@ class YoutubeDL:
return None return None
for idx, t in list(enumerate(thumbnails))[::-1]: for idx, t in list(enumerate(thumbnails))[::-1]:
thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg') thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
if multiple:
thumb_ext = f'{t["id"]}.{thumb_ext}'
thumb_display_id = f'{label} thumbnail {t["id"]}' thumb_display_id = f'{label} thumbnail {t["id"]}'
thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))

View File

@ -2083,6 +2083,7 @@ from .theplatform import (
from .thestar import TheStarIE from .thestar import TheStarIE
from .thesun import TheSunIE from .thesun import TheSunIE
from .theweatherchannel import TheWeatherChannelIE from .theweatherchannel import TheWeatherChannelIE
from .thirtydaysinger import ThirtyDaySingerIE, ThirtyDaySingerPlaylistIE
from .thisamericanlife import ThisAmericanLifeIE from .thisamericanlife import ThisAmericanLifeIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .thisvid import ( from .thisvid import (

View File

@ -66,6 +66,14 @@ class AfreecaTVBaseIE(InfoExtractor):
extensions={'legacy_ssl': True}), display_id, extensions={'legacy_ssl': True}), display_id,
'Downloading API JSON', 'Unable to download API JSON') 'Downloading API JSON', 'Unable to download API JSON')
@staticmethod
def _fixup_thumb(thumb_url):
    """Wrap a thumbnail URL in a one-item thumbnails list with a forced 'jpg' ext.

    Returns None when `thumb_url` does not pass `url_or_none`.
    """
    if url_or_none(thumb_url):
        # Core would determine_ext as 'php' from the url, so we need to provide the real ext
        # See: https://github.com/yt-dlp/yt-dlp/issues/11537
        return [{'url': thumb_url, 'ext': 'jpg'}]
    return None
class AfreecaTVIE(AfreecaTVBaseIE): class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'soop' IE_NAME = 'soop'
@ -155,7 +163,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'uploader': ('writer_nick', {str}), 'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}), 'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {int_or_none(scale=1000)}), 'duration': ('total_file_duration', {int_or_none(scale=1000)}),
'thumbnail': ('thumb', {url_or_none}), 'thumbnails': ('thumb', {self._fixup_thumb}),
}) })
entries = [] entries = []
@ -226,8 +234,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
return self.playlist_result(self._entries(data), video_id) return self.playlist_result(self._entries(data), video_id)
@staticmethod def _entries(self, data):
def _entries(data):
# 'files' is always a list with 1 element # 'files' is always a list with 1 element
yield from traverse_obj(data, ( yield from traverse_obj(data, (
'data', lambda _, v: v['story_type'] == 'catch', 'data', lambda _, v: v['story_type'] == 'catch',
@ -238,7 +245,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
'title': ('title', {str}), 'title': ('title', {str}),
'uploader': ('writer_nick', {str}), 'uploader': ('writer_nick', {str}),
'uploader_id': ('writer_id', {str}), 'uploader_id': ('writer_id', {str}),
'thumbnail': ('thumb', {url_or_none}), 'thumbnails': ('thumb', {self._fixup_thumb}),
'timestamp': ('write_timestamp', {int_or_none}), 'timestamp': ('write_timestamp', {int_or_none}),
})) }))

View File

@ -279,6 +279,7 @@ class InfoExtractor:
thumbnails: A list of dictionaries, with the following entries: thumbnails: A list of dictionaries, with the following entries:
* "id" (optional, string) - Thumbnail format ID * "id" (optional, string) - Thumbnail format ID
* "url" * "url"
* "ext" (optional, string) - actual image extension if not given in URL
* "preference" (optional, int) - quality of the image * "preference" (optional, int) - quality of the image
* "width" (optional, int) * "width" (optional, int)
* "height" (optional, int) * "height" (optional, int)

View File

@ -0,0 +1,104 @@
import re
import urllib.parse

from .wistia import WistiaBaseIE
from ..utils import ExtractorError, clean_html, get_elements_html_by_class
class ThirtyDaySingerBase(WistiaBaseIE):
    """Shared lesson-page logic for the 30 Day Singer extractors."""

    # Captures the last path segment of a lesson URL: /tutorial/<course>/<index>
    _INDEX_EXTRACTION_RE = r'/tutorial/[\w-]+/(?P<index>[\w-]+)'

    def _extract_for_url(self, url):
        """Extract one lesson: Wistia media info merged with page metadata.

        Raises ExtractorError when the lesson page contains no Wistia async embed.
        """
        lesson_index = re.search(self._INDEX_EXTRACTION_RE, url).group('index')
        webpage = self._download_webpage(url, lesson_index)

        # A bare next() would leak StopIteration when the page has no embed
        match = next(self._extract_wistia_async_embed(webpage), None)
        if match is None:
            raise ExtractorError('Unable to find a Wistia embed on the lesson page')
        embed_config = self._download_embed_config('medias', match.group('id'), url)

        embed_infojson = self._extract_media(embed_config)
        webpage_infojson = self._extract_webpage_data(webpage)

        # Page metadata takes precedence over the embed's, but a value that
        # could not be found on the page must not clobber the embed's value
        return {
            **embed_infojson,
            **{k: v for k, v in webpage_infojson.items() if v is not None},
        }

    def _extract_webpage_data(self, webpage):
        """Return a dict with the `title` and `description` found in `webpage`.

        Both keys are always present; a value is None when it could not be found.
        """
        # default=None: with the fatal default a missing <h1> raises before
        # the <title> fallback below can be tried
        title = self._html_search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', default=None)
        description = self._html_search_meta('description', webpage, fatal=False)

        return {
            'title': title or self._html_extract_title(webpage),
            'description': (
                None if description is None
                else clean_html(self._format_html_list(description))),
        }

    # The site makes extensive use of HTML lists for formatting and `clean_html`
    # doesn't handle them well. This is needed to keep lists readable.
    def _format_html_list(self, html):
        """Replace <ul>/<li> markup in `html` with <br> breaks and '- ' bullets.

        Empty or None input is returned unchanged.
        """
        if not html:
            return html

        replacements = {
            '<ul>': '<br>',
            '</ul>': '<br>',
            '<li>': '<br>- ',
            '</li>': '',
        }

        for k, v in replacements.items():
            html = html.replace(k, v)

        return html
class ThirtyDaySingerIE(ThirtyDaySingerBase):
    # Single lesson page: /tutorial/<course>/<lesson>
    _VALID_URL = r'https?://www\.30daysinger\.com/tutorial/[\w-]+/[\w-]+'

    _TESTS = [{
        'url': 'https://www.30daysinger.com/tutorial/30-day-beginner-course-with-jonathan-estabrooks/1',
        'md5': '56bb11529b9777899b27b599d4b16cf6',
        'info_dict': {
            'id': 'tegd38l3d5',
            'ext': 'mp4',
            'title': 'Welcome to 30 Day Singer',
            'description': 'md5:d3291de8988be57b1d3e411126ba4d33',
            'thumbnail': 'http://embed.wistia.com/deliveries/c26a85cb98e32efa8a5e12a0576e63355af66230.jpg',
            'duration': 344.22,
            'timestamp': 1559952526,
            'upload_date': '20190608',
        },
    }]

    def _real_extract(self, url):
        # The lesson logic lives in the base class so the playlist extractor can reuse it
        lesson_info = self._extract_for_url(url)
        return lesson_info
class ThirtyDaySingerPlaylistIE(ThirtyDaySingerBase):
    """Course page extractor: collects every non-premium lesson linked from the page."""

    _URI_BASE = 'https://www.30daysinger.com'
    _VALID_URL = r'https?://www\.30daysinger\.com/tutorial/(?P<playlist_id>[\w-]+)/?(?:$|[#?])'

    _TESTS = [{
        'url': 'https://www.30daysinger.com/tutorial/30-day-beginner-course-with-jonathan-estabrooks',
        'info_dict': {
            'id': '30-day-beginner-course-with-jonathan-estabrooks',
            'description': 'md5:8cf6d6c7c377895653c9cde9dfc4104f',
            'title': '30 Day Beginner Course with Jonathan Estabrooks',
        },
        'playlist_count': 1,
        'expected_warnings': ['This video is for premium members only'],
    }]

    def _real_extract(self, url):
        playlist_id = self._match_valid_url(url).group('playlist_id')
        webpage = self._download_webpage(url, playlist_id)
        playlist_attrs = self._extract_webpage_data(webpage)

        entries = []
        for html_element in get_elements_html_by_class('playlist-item-link', webpage):
            # default=None: with the fatal default one item without an href
            # would abort the whole playlist and the guard below would be dead code
            href = self._search_regex(r'href="([^"]+)"', html_element, 'href', default=None)

            if not href:
                continue
            # Often _some_ content is free so we should still download that but warn the user
            # when we encounter premium content.
            # NOTE: this only applies to the playlist extractor, not the single video extractor
            if 'upgrade' in href:
                self.report_warning('This video is for premium members only')
                continue

            # urljoin instead of string concatenation so that absolute hrefs and
            # hrefs without a leading slash still produce a valid lesson URL
            entries.append(self._extract_for_url(urllib.parse.urljoin(self._URI_BASE, href)))

        return self.playlist_result(entries, playlist_id, playlist_attrs['title'], playlist_attrs['description'])

View File

@ -216,7 +216,7 @@ def partial_application(func):
sig = inspect.signature(func) sig = inspect.signature(func)
required_args = [ required_args = [
param.name for param in sig.parameters.values() param.name for param in sig.parameters.values()
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL) if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
if param.default is inspect.Parameter.empty if param.default is inspect.Parameter.empty
] ]
@ -4837,7 +4837,6 @@ def number_of_digits(number):
return len('%d' % number) return len('%d' % number)
@partial_application
def join_nonempty(*values, delim='-', from_dict=None): def join_nonempty(*values, delim='-', from_dict=None):
if from_dict is not None: if from_dict is not None:
values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)

View File

@ -332,14 +332,14 @@ class _RequiredError(ExtractorError):
@typing.overload @typing.overload
def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ... def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
@typing.overload @typing.overload
def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ... def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None):
""" """
Convert subtitles from a traversal into a subtitle dict. Convert subtitles from a traversal into a subtitle dict.
The path should have an `all` immediately before this function. The path should have an `all` immediately before this function.
@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
`quality` The sort order for each subtitle `quality` The sort order for each subtitle
""" """
if subs is None: if subs is None:
return functools.partial(subs_list_to_dict, ext=ext) return functools.partial(subs_list_to_dict, lang=lang, ext=ext)
result = collections.defaultdict(list) result = collections.defaultdict(list)
@ -360,10 +360,16 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
if not url_or_none(sub.get('url')) and not sub.get('data'): if not url_or_none(sub.get('url')) and not sub.get('data'):
continue continue
sub_id = sub.pop('id', None) sub_id = sub.pop('id', None)
if sub_id is None: if not isinstance(sub_id, str):
continue if not lang:
if ext is not None and not sub.get('ext'): continue
sub['ext'] = ext sub_id = lang
sub_ext = sub.get('ext')
if not isinstance(sub_ext, str):
if not ext:
sub.pop('ext', None)
else:
sub['ext'] = ext
result[sub_id].append(sub) result[sub_id].append(sub)
result = dict(result) result = dict(result)
@ -452,9 +458,9 @@ def trim_str(*, start=None, end=None):
return trim return trim
def unpack(func): def unpack(func, **kwargs):
@functools.wraps(func) @functools.wraps(func)
def inner(items, **kwargs): def inner(items):
return func(*items, **kwargs) return func(*items, **kwargs)
return inner return inner