Compare commits

...

8 Commits

Author SHA1 Message Date
Subrat Lima
714443bce7
Merge 2c0244cb2f into c699bafc50 2024-11-16 11:37:08 +01:00
bashonly
c699bafc50 [ie/soop] Fix thumbnail extraction (#11545)
Closes #11537

Authored by: bashonly
2024-11-15 22:51:55 +00:00
bashonly
eb64ae7d5d [ie] Allow ext override for thumbnails (#11545)
Authored by: bashonly
2024-11-15 22:51:55 +00:00
Simon Sawicki
c014fbcddc
[utils] subs_list_to_dict: Add lang default parameter (#11508)
Authored by: Grub4K
2024-11-15 23:25:52 +01:00
Simon Sawicki
39d79c9b9c
[utils] Fix join_nonempty, add **kwargs to unpack (#11559)
Authored by: Grub4K
2024-11-15 22:06:15 +01:00
subrat-lima
2c0244cb2f [ie/atptour] refactored url pattern for better extensibility 2024-09-01 21:46:17 +05:30
subrat-lima
2fe0226c0f [ie/atptour] enhancement - added support for spanish pages 2024-09-01 20:17:25 +05:30
subrat-lima
485cbe4990 [ie/atptour] add extractor and updated data extraction function 2024-09-01 17:06:24 +05:30
9 changed files with 218 additions and 26 deletions

View File

@ -481,7 +481,7 @@ class TestTraversalHelpers:
'id': 'name',
'data': 'content',
'url': 'url',
}, all, {subs_list_to_dict}]) == {
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [{'data': 'content'}],
}, 'subs with mandatory items missing should be filtered'
@ -507,6 +507,54 @@ class TestTraversalHelpers:
{'url': 'https://example.com/subs/en1', 'ext': 'ext'},
{'url': 'https://example.com/subs/en2', 'ext': 'ext'},
]}, '`quality` key should sort subtitle list accordingly'
assert traverse_obj([
{'name': 'de', 'url': 'https://example.com/subs/de.ass'},
{'name': 'de'},
{'name': 'en', 'content': 'content'},
{'url': 'https://example.com/subs/en'},
], [..., {
'id': 'name',
'url': 'url',
'data': 'content',
}, all, {subs_list_to_dict(lang='en')}]) == {
'de': [{'url': 'https://example.com/subs/de.ass'}],
'en': [
{'data': 'content'},
{'url': 'https://example.com/subs/en'},
],
}, 'optionally provided lang should be used if no id available'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang=None)}]) == {
'de': [
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be ignored for id and ext'
assert traverse_obj([
{'name': 1, 'url': 'https://example.com/subs/de1'},
{'name': {}, 'url': 'https://example.com/subs/de2'},
{'name': 'de', 'ext': 1, 'url': 'https://example.com/subs/de3'},
{'name': 'de', 'ext': {}, 'url': 'https://example.com/subs/de4'},
], [..., {
'id': 'name',
'url': 'url',
'ext': 'ext',
}, all, {subs_list_to_dict(lang='de')}]) == {
'de': [
{'url': 'https://example.com/subs/de1'},
{'url': 'https://example.com/subs/de2'},
{'url': 'https://example.com/subs/de3'},
{'url': 'https://example.com/subs/de4'},
],
}, 'non str types should be replaced by default id'
def test_trim_str(self):
with pytest.raises(TypeError):
@ -525,7 +573,7 @@ class TestTraversalHelpers:
def test_unpack(self):
assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123'
assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3'
assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3'
assert unpack(join_nonempty, delim=' ')([1, 2, 3]) == '1 2 3'
with pytest.raises(TypeError):
unpack(join_nonempty)()
with pytest.raises(TypeError):

View File

@ -72,7 +72,6 @@ from yt_dlp.utils import (
intlist_to_bytes,
iri_to_uri,
is_html,
join_nonempty,
js_to_json,
limit_length,
locked_file,
@ -2158,10 +2157,6 @@ Line 1
assert int_or_none(v=10) == 10, 'keyword passed positional should call function'
assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function'
assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially'
assert callable(join_nonempty()), 'varargs positional should apply partially'
assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function'
if __name__ == '__main__':
unittest.main()

View File

@ -4381,7 +4381,9 @@ class YoutubeDL:
return None
for idx, t in list(enumerate(thumbnails))[::-1]:
thumb_ext = (f'{t["id"]}.' if multiple else '') + determine_ext(t['url'], 'jpg')
thumb_ext = t.get('ext') or determine_ext(t['url'], 'jpg')
if multiple:
thumb_ext = f'{t["id"]}.{thumb_ext}'
thumb_display_id = f'{label} thumbnail {t["id"]}'
thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext'))
thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext'))

View File

@ -169,6 +169,10 @@ from .asobichannel import (
AsobiChannelTagURLIE,
)
from .asobistage import AsobiStageIE
from .atptour import (
ATPTourNewsIE,
ATPTourVideoIE,
)
from .atresplayer import AtresPlayerIE
from .atscaleconf import AtScaleConfEventIE
from .atvat import ATVAtIE

View File

@ -66,6 +66,14 @@ class AfreecaTVBaseIE(InfoExtractor):
extensions={'legacy_ssl': True}), display_id,
'Downloading API JSON', 'Unable to download API JSON')
@staticmethod
def _fixup_thumb(thumb_url):
    """Wrap a thumbnail URL in a one-element `thumbnails` list with an explicit extension.

    Returns None when `thumb_url` does not pass `url_or_none`.
    """
    if url_or_none(thumb_url):
        # The extension is given explicitly because core would otherwise
        # determine_ext 'php' from the url instead of the real image type
        # See: https://github.com/yt-dlp/yt-dlp/issues/11537
        return [{'url': thumb_url, 'ext': 'jpg'}]
    return None
class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'soop'
@ -155,7 +163,7 @@ class AfreecaTVIE(AfreecaTVBaseIE):
'uploader': ('writer_nick', {str}),
'uploader_id': ('bj_id', {str}),
'duration': ('total_file_duration', {int_or_none(scale=1000)}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
})
entries = []
@ -226,8 +234,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
return self.playlist_result(self._entries(data), video_id)
@staticmethod
def _entries(data):
def _entries(self, data):
# 'files' is always a list with 1 element
yield from traverse_obj(data, (
'data', lambda _, v: v['story_type'] == 'catch',
@ -238,7 +245,7 @@ class AfreecaTVCatchStoryIE(AfreecaTVBaseIE):
'title': ('title', {str}),
'uploader': ('writer_nick', {str}),
'uploader_id': ('writer_id', {str}),
'thumbnail': ('thumb', {url_or_none}),
'thumbnails': ('thumb', {self._fixup_thumb}),
'timestamp': ('write_timestamp', {int_or_none}),
}))

127
yt_dlp/extractor/atptour.py Normal file
View File

@ -0,0 +1,127 @@
import re

from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    base_url,
    extract_attributes,
    get_element_html_by_id,
    traverse_obj,
    urljoin,
)
class ATPTourVideoIE(InfoExtractor):
    """Extractor for single video pages on atptour.com (English and Spanish).

    The page itself is only used to locate a "featured videos" JSON endpoint;
    the first item of that JSON supplies the Brightcove account, player and
    video ids, and the actual extraction is delegated to BrightcoveNewIE.
    """
    IE_NAME = 'atptour:video'
    _VALID_URL = r'https?://(?:www\.)?atptour\.com/(?:en|es)/video/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.atptour.com/en/video/challenger-highlights-nishikori-wins-in-como-2024',
        'md5': '4721002227d98fe89afafa40eba3068d',
        'info_dict': {
            'id': '6361099221112',
            'ext': 'mp4',
            'description': 'md5:ef8afed21c52cbe4ad3409045d59f413',
            'upload_date': '20240827',
            'duration': 105.152,
            'tags': 'count:6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Challenger Highlights: Nishikori wins in Como 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724775281,
        },
    }, {
        'url': 'https://www.atptour.com/en/video/highlights-svajda-earns-highestranked-win-of-career-vs-cerundolo-winstonsalem-2024',
        'md5': 'a3829d10bdcb1829568fd88b9e6ecb15',
        'info_dict': {
            'id': '6360716257112',
            'ext': 'mp4',
            'description': 'md5:a334aeb73eac631ffab8249b1e68194c',
            'upload_date': '20240820',
            'duration': 139.691,
            'tags': 'count:5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Highlights: Svajda earns highest-ranked win of career vs. Cerundolo Winston-Salem 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724183755,
        },
    }, {
        'url': 'https://www.atptour.com/es/video/highlights-michelsen-defeats-fucsovics-in-winston-salem-2024',
        'md5': '7ba4c3aabef9eb20a1b9877f28e6f775',
        'info_dict': {
            'id': '6360727636112',
            'ext': 'mp4',
            'description': 'md5:2c5682fdfa514e508c6d947e9e9b6eeb',
            'upload_date': '20240821',
            'duration': 135.424,
            'tags': 'count:6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Highlights: Michelsen defeats Fucsovics in Winston-Salem 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724205624,
        },
    }, {
        'url': 'https://www.atptour.com/en/video/highlights-sonego-dominates-michelsen-for-winston-salem-open-title-2024',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        # Every later step needs the page, so let a failed download raise
        # instead of passing a non-string webpage on to the HTML helpers
        webpage = self._download_webpage(url, display_id, impersonate=True)

        # Hidden inputs are keyed by their `class` attribute here
        hidden_inputs = self._hidden_inputs(webpage, 'class')
        featured_videos_url = urljoin(base_url(url), hidden_inputs.get('atp_featured-videos-endpoint'))
        if not featured_videos_url:
            raise ExtractorError('Unable to find the featured videos endpoint', video_id=display_id)

        json_data = self._download_json(featured_videos_url, display_id, impersonate=True)
        video_data = traverse_obj(json_data, ('content', 0, {dict})) or {}
        account_id, player_id, video_id = (
            video_data.get(key) for key in ('videoAccountId', 'videoPlayerId', 'videoId'))
        # Refuse to build a player URL containing the literal text "None"
        if not (account_id and player_id and video_id):
            raise ExtractorError('Unable to extract Brightcove video data', video_id=display_id)

        return self.url_result(
            f'https://players.brightcove.net/{account_id}/{player_id}/index.html?videoId={video_id}', BrightcoveNewIE)
class ATPTourNewsIE(InfoExtractor):
    """Extractor for news articles on atptour.com (English and Spanish).

    Returns a playlist made of the article's main Brightcove player (the
    element with id `articleVideoJSPlayer`, if present) followed by every
    Brightcove player embedded through an iframe; each entry is delegated to
    BrightcoveNewIE.
    """
    IE_NAME = 'atptour:news'
    _VALID_URL = r'https?://(?:www\.)?atptour\.com/(?:en|es)/news/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.atptour.com/en/news/sinner-zverev-cincinnati-2024-sf',
        'playlist_mincount': 2,
        'info_dict': {
            'id': 'sinner-zverev-cincinnati-2024-sf',
            'title': 'Jannik Sinner battles past Alexander Zverev to reach Cincinnati final | ATP Tour | Tennis',
            'description': 'md5:30cd3df666c8a5d45731d1e85d8d43ae',
        },
    }, {
        'url': 'https://www.atptour.com/en/news/borges-us-open-2024-this-is-tennis',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'borges-us-open-2024-this-is-tennis',
            'title': 'Nuno Borges: Building legos, facing Nadal, Cirque du Soleil & more | ATP Tour | Tennis',
            'description': 'md5:aaef866660c4e3ced69118c0f6ed237a',
        },
    }, {
        'url': 'https://www.atptour.com/es/news/popyrin-us-open-2024-feature',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'popyrin-us-open-2024-feature',
            'title': 'Alexei Popyrin: Hamilton, pollo frito y la revancha de Djokovic | ATP Tour | Tennis',
            'description': 'md5:b62a35720a278c9ab8410847915dc581',
        },
    }]

    def _real_extract(self, url):
        display_id = self._match_id(url)
        # Everything below parses the page, so let a failed download raise
        # instead of passing a non-string webpage on to the HTML helpers
        webpage = self._download_webpage(url, display_id, impersonate=True)
        title = self._html_extract_title(webpage)
        description = self._og_search_description(webpage)

        entries = []

        first_video = get_element_html_by_id('articleVideoJSPlayer', webpage)
        if first_video is not None:
            attributes = extract_attributes(first_video)
            account_id, player_id, video_id = (
                attributes.get(name) for name in ('data-account', 'data-player', 'data-video-id'))
            # Skip the player unless all three ids are present, otherwise the
            # URL would contain the literal text "None"
            if account_id and player_id and video_id:
                entries.append(self.url_result(
                    f'https://players.brightcove.net/{account_id}/{player_id}/index.html?videoId={video_id}',
                    BrightcoveNewIE))

        # Allow any attributes between the tag name and `src`; the leading \s
        # keeps attributes that merely end in "src" (e.g. data-src) from matching
        entries.extend(
            self.url_result(video_url, BrightcoveNewIE)
            for video_url in re.findall(
                r'<iframe\b[^>]*\ssrc="(https://players\.brightcove\.net/[^"]+)"', webpage))

        return self.playlist_result(entries, display_id, title, description)

View File

@ -279,6 +279,7 @@ class InfoExtractor:
thumbnails: A list of dictionaries, with the following entries:
* "id" (optional, string) - Thumbnail format ID
* "url"
* "ext" (optional, string) - actual image extension if not given in URL
* "preference" (optional, int) - quality of the image
* "width" (optional, int)
* "height" (optional, int)
@ -1801,7 +1802,7 @@ class InfoExtractor:
return traverse_obj(ret, traverse) or {}
@staticmethod
def _hidden_inputs(html):
def _hidden_inputs(html, attr_list=('name', 'id')):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
hidden_inputs = {}
for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
@ -1810,7 +1811,10 @@ class InfoExtractor:
continue
if attrs.get('type') not in ('hidden', 'submit'):
continue
name = attrs.get('name') or attrs.get('id')
for attr in variadic(attr_list):
name = attrs.get(attr)
if name is not None:
break
value = attrs.get('value')
if name and value is not None:
hidden_inputs[name] = value

View File

@ -216,7 +216,7 @@ def partial_application(func):
sig = inspect.signature(func)
required_args = [
param.name for param in sig.parameters.values()
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
if param.default is inspect.Parameter.empty
]
@ -4837,7 +4837,6 @@ def number_of_digits(number):
return len('%d' % number)
@partial_application
def join_nonempty(*values, delim='-', from_dict=None):
if from_dict is not None:
values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)

View File

@ -332,14 +332,14 @@ class _RequiredError(ExtractorError):
@typing.overload
def subs_list_to_dict(*, ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
def subs_list_to_dict(*, lang: str | None = 'und', ext: str | None = None) -> collections.abc.Callable[[list[dict]], dict[str, list[dict]]]: ...
@typing.overload
def subs_list_to_dict(subs: list[dict] | None, /, *, ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None, /, *, lang: str | None = 'und', ext: str | None = None) -> dict[str, list[dict]]: ...
def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
def subs_list_to_dict(subs: list[dict] | None = None, /, *, lang='und', ext=None):
"""
Convert subtitles from a traversal into a subtitle dict.
The path should have an `all` immediately before this function.
@ -352,7 +352,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
`quality` The sort order for each subtitle
"""
if subs is None:
return functools.partial(subs_list_to_dict, ext=ext)
return functools.partial(subs_list_to_dict, lang=lang, ext=ext)
result = collections.defaultdict(list)
@ -360,9 +360,15 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
if not url_or_none(sub.get('url')) and not sub.get('data'):
continue
sub_id = sub.pop('id', None)
if sub_id is None:
if not isinstance(sub_id, str):
if not lang:
continue
if ext is not None and not sub.get('ext'):
sub_id = lang
sub_ext = sub.get('ext')
if not isinstance(sub_ext, str):
if not ext:
sub.pop('ext', None)
else:
sub['ext'] = ext
result[sub_id].append(sub)
result = dict(result)
@ -452,9 +458,9 @@ def trim_str(*, start=None, end=None):
return trim
def unpack(func):
def unpack(func, **kwargs):
@functools.wraps(func)
def inner(items, **kwargs):
def inner(items):
return func(*items, **kwargs)
return inner