Compare commits


9 Commits

Author SHA1 Message Date
bashonly
758cadf123
[ie/learningonscreen] raise login required with new session_cookies method
Authored by: bashonly
2024-11-03 13:28:06 -06:00
bashonly
ad786342e2
another one
Authored by: bashonly
2024-11-03 13:25:17 -06:00
Simon Sawicki
e6226206fc
*shrug*
2024-11-03 20:23:35 +01:00
Simon Sawicki
21c9c265ca
Some more adjustments
2024-11-03 20:12:50 +01:00
bashonly
a198c6f8eb
Even more traversal helper cleanup
Authored by: bashonly
2024-11-03 13:05:15 -06:00
bashonly
962edb31ae
Merge branch 'yt-dlp:master' into misc-cleanup-another-one
2024-11-03 12:21:22 -06:00
bashonly
b103aca24d
[utils] Fix and improve find_element and find_elements (#11443)
Fix d710a6ca7c

Authored by: bashonly, Grub4K

Co-authored-by: Simon Sawicki <contact@grub4k.xyz>
2024-11-03 18:19:45 +00:00
bashonly
e705746e27
Allow videos with .vid extension
2024-11-03 10:31:29 -06:00
bashonly
4f7f711a7b
Allow thumbnails with .jfif extension
Authored by: bashonly
2024-11-03 10:26:15 -06:00
13 changed files with 160 additions and 127 deletions

View File: test/test_traversal.py

@@ -13,6 +13,8 @@ from yt_dlp.utils import (
     str_or_none,
 )
 from yt_dlp.utils.traversal import (
+    find_element,
+    find_elements,
     require,
     subs_list_to_dict,
     traverse_obj,
@@ -37,6 +39,14 @@ _TEST_DATA = {
     'dict': {},
 }

+_TEST_HTML = '''<html><body>
+    <div class="a">1</div>
+    <div class="a" id="x" custom="z">2</div>
+    <div class="b" data-id="y" custom="z">3</div>
+    <p class="a">4</p>
+    <p id="d" custom="e">5</p>
+</body></html>'''
+

 class TestTraversal:
     def test_traversal_base(self):
@@ -521,6 +531,50 @@ class TestTraversalHelpers:
         with pytest.raises(TypeError):
             unpack()

+    def test_find_element(self):
+        for improper_kwargs in [
+            dict(attr='data-id'),
+            dict(value='y'),
+            dict(attr='data-id', value='y', cls='a'),
+            dict(attr='data-id', value='y', id='x'),
+            dict(cls='a', id='x'),
+            dict(cls='a', tag='p'),
+            dict(cls='[ab]', regex=True),
+        ]:
+            with pytest.raises(AssertionError):
+                find_element(**improper_kwargs)(_TEST_HTML)
+
+        assert find_element(cls='a')(_TEST_HTML) == '1'
+        assert find_element(cls='a', html=True)(_TEST_HTML) == '<div class="a">1</div>'
+        assert find_element(id='x')(_TEST_HTML) == '2'
+        assert find_element(id='[ex]')(_TEST_HTML) is None
+        assert find_element(id='[ex]', regex=True)(_TEST_HTML) == '2'
+        assert find_element(id='x', html=True)(_TEST_HTML) == '<div class="a" id="x" custom="z">2</div>'
+        assert find_element(attr='data-id', value='y')(_TEST_HTML) == '3'
+        assert find_element(attr='data-id', value='y(?:es)?')(_TEST_HTML) is None
+        assert find_element(attr='data-id', value='y(?:es)?', regex=True)(_TEST_HTML) == '3'
+        assert find_element(
+            attr='data-id', value='y', html=True)(_TEST_HTML) == '<div class="b" data-id="y" custom="z">3</div>'
+
+    def test_find_elements(self):
+        for improper_kwargs in [
+            dict(tag='p'),
+            dict(attr='data-id'),
+            dict(value='y'),
+            dict(attr='data-id', value='y', cls='a'),
+            dict(cls='a', tag='div'),
+            dict(cls='[ab]', regex=True),
+        ]:
+            with pytest.raises(AssertionError):
+                find_elements(**improper_kwargs)(_TEST_HTML)
+
+        assert find_elements(cls='a')(_TEST_HTML) == ['1', '2', '4']
+        assert find_elements(cls='a', html=True)(_TEST_HTML) == [
+            '<div class="a">1</div>', '<div class="a" id="x" custom="z">2</div>', '<p class="a">4</p>']
+        assert find_elements(attr='custom', value='z')(_TEST_HTML) == ['2', '3']
+        assert find_elements(attr='custom', value='[ez]')(_TEST_HTML) == []
+        assert find_elements(attr='custom', value='[ez]', regex=True)(_TEST_HTML) == ['2', '3', '5']
+

 class TestDictGet:
     def test_dict_get(self):
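
Note: find_element and find_elements return matcher callables rather than results, which is what lets them slot into traverse_obj paths as one-element-set transformation steps. A minimal standalone sketch of that composition (the fixture is trimmed from _TEST_HTML above; the pattern mirrors the bandcamp change below):

    from yt_dlp.utils import extract_attributes
    from yt_dlp.utils.traversal import find_element, traverse_obj

    html = '<html><body><div class="a" id="x" custom="z">2</div></body></html>'

    # {callable} inside a traversal path applies the callable to the current value:
    # grab the full element HTML, then parse its attributes into a dict
    attrs = traverse_obj(html, (
        {find_element(id='x', html=True)},  # '<div class="a" id="x" custom="z">2</div>'
        {extract_attributes},               # {'class': 'a', 'id': 'x', 'custom': 'z'}
    ))
    assert attrs['custom'] == 'z'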

View File: yt_dlp/extractor/bandcamp.py

@@ -1,4 +1,3 @@
-import functools
 import json
 import random
 import re
@@ -10,7 +9,6 @@ from ..utils import (
     ExtractorError,
     extract_attributes,
     float_or_none,
-    get_element_html_by_id,
     int_or_none,
     parse_filesize,
     str_or_none,
@@ -21,7 +19,7 @@ from ..utils import (
     url_or_none,
     urljoin,
 )
-from ..utils.traversal import traverse_obj
+from ..utils.traversal import find_element, traverse_obj


 class BandcampIE(InfoExtractor):
@@ -511,7 +509,7 @@ class BandcampUserIE(InfoExtractor):
             or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))

         yield from traverse_obj(webpage, (
-            {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes},
+            {find_element(id='music-grid', html=True)}, {extract_attributes},
             'data-client-items', {json.loads}, ..., 'page_url', {str}))

     def _real_extract(self, url):

View File: yt_dlp/extractor/bpb.py

@@ -1,35 +1,20 @@
-import functools
 import re

 from .common import InfoExtractor
 from ..utils import (
     clean_html,
     extract_attributes,
-    get_element_text_and_html_by_tag,
-    get_elements_by_class,
     join_nonempty,
     js_to_json,
     mimetype2ext,
     unified_strdate,
     url_or_none,
     urljoin,
-    variadic,
 )
-from ..utils.traversal import traverse_obj
-
-
-def html_get_element(tag=None, cls=None):
-    assert tag or cls, 'One of tag or class is required'
-
-    if cls:
-        func = functools.partial(get_elements_by_class, cls, tag=tag)
-    else:
-        func = functools.partial(get_element_text_and_html_by_tag, tag)
-
-    def html_get_element_wrapper(html):
-        return variadic(func(html))[0]
-
-    return html_get_element_wrapper
+from ..utils.traversal import (
+    find_element,
+    traverse_obj,
+)


 class BpbIE(InfoExtractor):
@@ -41,12 +26,12 @@ class BpbIE(InfoExtractor):
         'info_dict': {
             'id': '297',
             'ext': 'mp4',
-            'creator': 'Kooperative Berlin',
-            'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
-            'release_date': '20160115',
+            'creators': ['Kooperative Berlin'],
+            'description': r're:Joachim Gauck, .*\n\nKamera: .*',
+            'release_date': '20150716',
             'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
-            'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
-            'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
+            'tags': [],
+            'thumbnail': r're:https?://www\.bpb\.de/cache/images/7/297_teaser_16x9_1240\.jpg.*',
             'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
             'uploader': 'Bundeszentrale für politische Bildung',
         },
@@ -55,11 +40,12 @@ class BpbIE(InfoExtractor):
         'info_dict': {
             'id': '522184',
             'ext': 'mp4',
-            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
+            'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
             'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
             'release_date': '20230621',
-            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
-            'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
+            'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
+            'tags': [],
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/4/522184_teaser_16x9_1240\.png.*',
             'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
             'uploader': 'Bundeszentrale für politische Bildung',
         },
@@ -68,11 +54,12 @@ class BpbIE(InfoExtractor):
         'info_dict': {
             'id': '518789',
             'ext': 'mp4',
-            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
+            'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
             'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
             'release_date': '20230302',
-            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
-            'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
+            'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
+            'tags': [],
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/9/518789_teaser_16x9_1240\.jpeg.*',
             'title': 'md5:3e956f264bb501f6383f10495a401da4',
             'uploader': 'Bundeszentrale für politische Bildung',
         },
@@ -84,12 +71,12 @@ class BpbIE(InfoExtractor):
         'info_dict': {
             'id': '315813',
             'ext': 'mp3',
-            'creator': 'Axel Schröder',
+            'creators': ['Axel Schröder'],
             'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
             'release_date': '20200921',
             'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
             'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
-            'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/3/315813_teaser_16x9_1240\.png.*',
             'title': 'Folge 1: Eine Einführung',
             'uploader': 'Bundeszentrale für politische Bildung',
         },
@@ -98,12 +85,12 @@ class BpbIE(InfoExtractor):
         'info_dict': {
             'id': '517806',
             'ext': 'mp3',
-            'creator': 'Bundeszentrale für politische Bildung',
+            'creators': ['Bundeszentrale für politische Bildung'],
             'description': 'md5:594689600e919912aade0b2871cc3fed',
             'release_date': '20230127',
             'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
             'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
-            'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/6/517806_teaser_16x9_1240\.png.*',
             'title': 'Die Weltanschauung der "Neuen Rechten"',
             'uploader': 'Bundeszentrale für politische Bildung',
         },
@@ -147,7 +134,7 @@ class BpbIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)

-        title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
+        title_result = traverse_obj(webpage, ({find_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
         json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))

         return {
@@ -156,10 +143,10 @@ class BpbIE(InfoExtractor):
             # This metadata could be interpreted otherwise, but it fits "series" the most
             'series': traverse_obj(title_result, ('series', {str.strip})) or None,
             'description': join_nonempty(*traverse_obj(webpage, [(
-                {html_get_element(cls='opening-intro')},
-                [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
+                {find_element(cls='opening-intro')},
+                [{find_element(tag='bpb-accordion-item')}, {find_element(cls='text-content')}],
             ), {clean_html}]), delim='\n\n') or None,
-            'creator': self._html_search_meta('author', webpage),
+            'creators': traverse_obj(self._html_search_meta('author', webpage), all),
             'uploader': self._html_search_meta('publisher', webpage),
             'release_date': unified_strdate(self._html_search_meta('date', webpage)),
             'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
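
Note: the creator → creators migrations in this file lean on the `all` marker at the end of a traverse_obj path, which collects the result into a list; that is how a scalar meta value becomes the plural field. A tiny sketch of the idiom (input string invented):

    from yt_dlp.utils.traversal import traverse_obj

    # `all` (the builtin, used as a path marker) wraps the matched value in a list
    assert traverse_obj('Kooperative Berlin', all) == ['Kooperative Berlin']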

View File: yt_dlp/extractor/bundestag.py

@@ -8,11 +8,13 @@ from ..utils import (
     bug_reports_message,
     clean_html,
     format_field,
-    get_element_text_and_html_by_tag,
     int_or_none,
     url_or_none,
 )
-from ..utils.traversal import traverse_obj
+from ..utils.traversal import (
+    find_element,
+    traverse_obj,
+)


 class BundestagIE(InfoExtractor):
@@ -115,9 +117,8 @@ class BundestagIE(InfoExtractor):
                 note='Downloading metadata overlay', fatal=False,
             ), {
                 'title': (
-                    {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
-                    {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
-                'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
+                    {find_element(tag='h3')}, {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
+                'description': ({find_element(tag='p')}, {clean_html}),
             }))

         return result

View File: yt_dlp/extractor/learningonscreen.py

@@ -6,13 +6,11 @@ from ..utils import (
     ExtractorError,
     clean_html,
     extract_attributes,
-    get_element_by_class,
-    get_element_html_by_id,
     join_nonempty,
     parse_duration,
     unified_timestamp,
 )
-from ..utils.traversal import traverse_obj
+from ..utils.traversal import find_element, traverse_obj


 class LearningOnScreenIE(InfoExtractor):
@@ -32,28 +30,24 @@ class LearningOnScreenIE(InfoExtractor):
     def _real_initialize(self):
         if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
-            self.raise_login_required(
-                'Use --cookies for authentication. See '
-                ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp '
-                'for how to manually pass cookies', method=None)
+            self.raise_login_required(method='session_cookies')

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)

         details = traverse_obj(webpage, (
-            {functools.partial(get_element_html_by_id, 'programme-details')}, {
-                'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}),
+            {find_element(id='programme-details', html=True)}, {
+                'title': ({find_element(tag='h2')}, {clean_html}),
                 'timestamp': (
-                    {functools.partial(get_element_by_class, 'broadcast-date')},
+                    {find_element(cls='broadcast-date')},
                     {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
                 'duration': (
-                    {functools.partial(get_element_by_class, 'prog-running-time')},
-                    {clean_html}, {parse_duration}),
+                    {find_element(cls='prog-running-time')}, {clean_html}, {parse_duration}),
             }))

         title = details.pop('title', None) or traverse_obj(webpage, (
-            {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')},
+            {find_element(id='add-to-existing-playlist', html=True)},
             {extract_attributes}, 'data-record-title', {clean_html}))

         entries = self._parse_html5_media_entries(

View File: yt_dlp/extractor/listennotes.py

@@ -6,12 +6,10 @@ from ..utils import (
     extract_attributes,
     get_element_by_class,
     get_element_html_by_id,
-    get_element_text_and_html_by_tag,
     parse_duration,
     strip_or_none,
-    traverse_obj,
-    try_call,
 )
+from ..utils.traversal import find_element, traverse_obj


 class ListenNotesIE(InfoExtractor):
@@ -22,14 +20,14 @@ class ListenNotesIE(InfoExtractor):
         'info_dict': {
             'id': 'KrDgvNb_u1n',
             'ext': 'mp3',
-            'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
-            'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
-            'duration': 2148.0,
-            'channel': 'Thriving on Overload',
+            'title': r're:Tim OReilly on noticing things other people .{113}',
+            'description': r're:(?s)We shape reality by what we notice and .{27459}',
+            'duration': 2215.0,
+            'channel': 'Amplifying Cognition',
             'channel_id': 'ed84wITivxF',
             'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
-            'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
-            'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
+            'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/amplifying-cognition-ross-dawson-Iemft4Gdr0k-ed84wITivxF.300x300.jpg',
+            'channel_url': 'https://www.listennotes.com/podcasts/amplifying-cognition-ross-dawson-ed84wITivxF/',
             'cast': ['Tim OReilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
         },
     }, {
@@ -39,13 +37,13 @@ class ListenNotesIE(InfoExtractor):
             'id': 'lwEA3154JzG',
             'ext': 'mp3',
             'title': 'Episode 177: WireGuard with Jason Donenfeld',
-            'description': 'md5:24744f36456a3e95f83c1193a3458594',
+            'description': r're:(?s)Jason Donenfeld lead developer joins us this hour to discuss WireGuard, .{3169}',
             'duration': 3861.0,
             'channel': 'Ask Noah Show',
             'channel_id': '4DQTzdS5-j7',
             'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
             'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
-            'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
+            'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-gD7vG150cxf-4DQTzdS5-j7.300x300.jpg',
             'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
         },
     }]
@@ -70,7 +68,7 @@ class ListenNotesIE(InfoExtractor):
             'id': audio_id,
             'url': data['audio'],
             'title': (data.get('data-title')
-                      or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
+                      or traverse_obj(webpage, ({find_element(tag='h1')}, {clean_html}))
                       or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
             'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
                             or strip_or_none(description)),

View File: yt_dlp/extractor/monstercat.py

@@ -4,15 +4,11 @@ from .common import InfoExtractor
 from ..utils import (
     clean_html,
     extract_attributes,
-    get_element_by_class,
-    get_element_html_by_class,
-    get_element_text_and_html_by_tag,
     int_or_none,
     strip_or_none,
-    traverse_obj,
-    try_call,
     unified_strdate,
 )
+from ..utils.traversal import find_element, traverse_obj


 class MonstercatIE(InfoExtractor):
@@ -26,19 +22,21 @@ class MonstercatIE(InfoExtractor):
             'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
             'release_date': '20230711',
             'album': 'The Secret Language of Trees',
-            'album_artist': 'BT',
+            'album_artists': ['BT'],
         },
     }]

     def _extract_tracks(self, table, album_meta):
         for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table):  # regex by chatgpt due to lack of get_elements_by_tag
-            title = clean_html(try_call(
-                lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0]))
-            ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '')
+            title = traverse_obj(td, (
+                {find_element(cls='d-inline-flex flex-column')},
+                {lambda x: x.partition(' <span')}, 0, {clean_html}))
+            ids = traverse_obj(td, (
+                {find_element(cls='btn-play cursor-pointer mr-small', html=True)}, {extract_attributes})) or {}
             track_id = ids.get('data-track-id')
             release_id = ids.get('data-release-id')
-            track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td)))
+            track_number = traverse_obj(td, ({find_element(cls='py-xsmall')}, {int_or_none}))
             if not track_id or not release_id:
                 self.report_warning(f'Skipping track {track_number}, ID(s) not found')
                 self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
@@ -48,7 +46,7 @@ class MonstercatIE(InfoExtractor):
                 'title': title,
                 'track': title,
                 'track_number': track_number,
-                'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))),
+                'artists': traverse_obj(td, ({find_element(cls='d-block fs-xxsmall')}, {clean_html}, all)),
                 'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
                 'id': track_id,
                 'ext': 'mp3',
@@ -57,20 +55,19 @@ class MonstercatIE(InfoExtractor):
     def _real_extract(self, url):
         url_id = self._match_id(url)
         html = self._download_webpage(url, url_id)
-        # wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html
-        tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or ''
-
-        title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0])
-        date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block',
-                                                                      html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate}))
+        # NB: HTMLParser may choke on this html; use {find_element} or try_call(lambda: get_element...)
+        tracklist_table = traverse_obj(html, {find_element(cls='table table-small')}) or ''
+        title = traverse_obj(html, ({find_element(tag='h1')}, {clean_html}))

         album_meta = {
             'title': title,
             'album': title,
             'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
-            'album_artist': try_call(
-                lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)),
-            'release_date': date,
+            'album_artists': traverse_obj(html, (
+                {find_element(cls='h-normal text-uppercase mb-desktop-medium mb-smallish')}, {clean_html}, all)),
+            'release_date': traverse_obj(html, (
+                {find_element(cls='font-italic mb-medium d-tablet-none d-phone-block')},
+                {lambda x: x.partition('Released ')}, 2, {strip_or_none}, {unified_strdate})),
         }

         return self.playlist_result(
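
Note: the `{lambda x: x.partition(' <span')}, 0` steps above work because str.partition returns a (head, separator, tail) 3-tuple and the following `0` indexes into it, keeping only the text before the separator. A tiny illustration with made-up input:

    from yt_dlp.utils.traversal import traverse_obj

    # partition never raises: if the separator is absent, head is the whole string
    assert traverse_obj('Rescue Me <span>(feat. KUURO)</span>', (
        {lambda x: x.partition(' <span')}, 0)) == 'Rescue Me'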

View File: yt_dlp/extractor/nekohacker.py

@@ -6,12 +6,10 @@ from ..utils import (
     determine_ext,
     extract_attributes,
     get_element_by_class,
-    get_element_text_and_html_by_tag,
     parse_duration,
-    traverse_obj,
-    try_call,
     url_or_none,
 )
+from ..utils.traversal import find_element, traverse_obj


 class NekoHackerIE(InfoExtractor):
@@ -35,7 +33,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20221101',
             'album': 'Nekoverse',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'Spaceship',
             'track_number': 1,
             'duration': 195.0,
@@ -53,7 +51,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20221101',
             'album': 'Nekoverse',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'City Runner',
             'track_number': 2,
             'duration': 148.0,
@@ -71,7 +69,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20221101',
             'album': 'Nekoverse',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'Nature Talk',
             'track_number': 3,
             'duration': 174.0,
@@ -89,7 +87,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20221101',
             'album': 'Nekoverse',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'Crystal World',
             'track_number': 4,
             'duration': 199.0,
@@ -115,7 +113,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20210115',
             'album': '進め!むじなカンパニー',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
             'track_number': 1,
         },
@@ -132,7 +130,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20210115',
             'album': '進め!むじなカンパニー',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
             'track_number': 2,
         },
@@ -149,7 +147,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20210115',
             'album': '進め!むじなカンパニー',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': '進め!むじなカンパニー (instrumental)',
             'track_number': 3,
         },
@@ -166,7 +164,7 @@ class NekoHackerIE(InfoExtractor):
             'acodec': 'mp3',
             'release_date': '20210115',
             'album': '進め!むじなカンパニー',
-            'artist': 'Neko Hacker',
+            'artists': ['Neko Hacker'],
             'track': 'むじな de なじむ (instrumental)',
             'track_number': 4,
         },
@@ -181,14 +179,17 @@ class NekoHackerIE(InfoExtractor):
         playlist = get_element_by_class('playlist', webpage)

         if not playlist:
-            iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or ''
-            iframe_src = url_or_none(extract_attributes(iframe).get('src'))
+            iframe_src = traverse_obj(webpage, (
+                {find_element(tag='iframe', html=True)}, {extract_attributes}, 'src', {url_or_none}))
             if not iframe_src:
                 raise ExtractorError('No playlist or embed found in webpage')
             elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src):
                 raise ExtractorError('Spotify embeds are not supported', expected=True)
             return self.url_result(url, 'Generic')

+        player_params = self._search_json(
+            r'var srp_player_params_[\da-f]+\s*=', webpage, 'player params', playlist_id, default={})
+
         entries = []
         for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1):
             entry = traverse_obj(extract_attributes(track), {
@@ -200,12 +201,12 @@ class NekoHackerIE(InfoExtractor):
                 'album': 'data-albumtitle',
                 'duration': ('data-tracktime', {parse_duration}),
                 'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
-                'thumbnail': ('data-albumart', {url_or_none}),
             })
             entries.append({
                 **entry,
+                'thumbnail': url_or_none(player_params.get('artwork')),
                 'track_number': track_number,
-                'artist': 'Neko Hacker',
+                'artists': ['Neko Hacker'],
                 'vcodec': 'none',
                 'acodec': 'mp3' if entry['ext'] == 'mp3' else None,
             })

View File: yt_dlp/extractor/nubilesporn.py

@@ -10,10 +10,10 @@ from ..utils import (
     get_element_html_by_class,
     get_elements_by_class,
     int_or_none,
-    try_call,
     unified_timestamp,
     urlencode_postdata,
 )
+from ..utils.traversal import find_element, find_elements, traverse_obj


 class NubilesPornIE(InfoExtractor):
@@ -70,9 +70,8 @@ class NubilesPornIE(InfoExtractor):
             url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0]

         channel_id, channel_name = self._search_regex(
-            r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page),
+            r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page) or '',
             'channel', fatal=False, group=('id', 'name')) or (None, None)
-        channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name)

         return {
             'id': video_id,
@@ -82,14 +81,14 @@ class NubilesPornIE(InfoExtractor):
             'thumbnail': media_entries.get('thumbnail'),
             'description': clean_html(get_element_html_by_class('content-pane-description', page)),
             'timestamp': unified_timestamp(get_element_by_class('date', page)),
-            'channel': channel_name,
+            'channel': re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) if channel_name else None,
             'channel_id': channel_id,
             'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'),
             'like_count': int_or_none(get_element_by_id('likecount', page)),
             'average_rating': float_or_none(get_element_by_class('score', page)),
             'age_limit': 18,
-            'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))),
-            'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))),
+            'categories': traverse_obj(page, ({find_element(cls='categories')}, {find_elements(cls='btn')}, ..., {clean_html})),
+            'tags': traverse_obj(page, ({find_elements(cls='tags')}, 1, {find_elements(cls='btn')}, ..., {clean_html})),
             'cast': get_elements_by_class('content-pane-performer', page),
             'availability': 'needs_auth',
             'series': channel_name,
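
Note: the categories/tags rewrite chains the new helpers: find_elements yields a list, an integer key indexes into it, a second find_elements runs on that fragment, and `...` branches over every match so {clean_html} applies per element. A minimal sketch with invented markup:

    from yt_dlp.utils import clean_html
    from yt_dlp.utils.traversal import find_elements, traverse_obj

    page = '''<div class="tags"><a class="btn">skip me</a></div>
    <div class="tags"><a class="btn">alpha</a><a class="btn">beta</a></div>'''

    # second "tags" block -> all "btn" matches -> clean_html on each branch
    assert traverse_obj(page, (
        {find_elements(cls='tags')}, 1, {find_elements(cls='btn')}, ..., {clean_html},
    )) == ['alpha', 'beta']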

View File: yt_dlp/extractor/tbsjp.py

@@ -3,14 +3,12 @@ from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     clean_html,
-    get_element_text_and_html_by_tag,
     int_or_none,
     str_or_none,
-    traverse_obj,
-    try_call,
     unified_timestamp,
     urljoin,
 )
+from ..utils.traversal import find_element, traverse_obj


 class TBSJPEpisodeIE(InfoExtractor):
@@ -64,7 +62,7 @@ class TBSJPEpisodeIE(InfoExtractor):
         self._merge_subtitles(subs, target=subtitles)

         return {
-            'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])),
+            'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})),
             'id': video_id,
             **traverse_obj(episode, {
                 'categories': ('keywords', {list}),

View File: yt_dlp/extractor/telewebion.py

@@ -14,6 +14,7 @@ def _fmt_url(url):

 class TelewebionIE(InfoExtractor):
+    _WORKING = False
     _VALID_URL = r'https?://(?:www\.)?telewebion\.com/episode/(?P<id>(?:0x[a-fA-F\d]+|\d+))'
     _TESTS = [{
         'url': 'http://www.telewebion.com/episode/0x1b3139c/',

View File: yt_dlp/utils/_utils.py

@@ -5142,6 +5142,7 @@ class _UnsafeExtensionError(Exception):
         'rm',
         'swf',
         'ts',
+        'vid',
         'vob',
         'vp9',
@@ -5174,6 +5175,7 @@ class _UnsafeExtensionError(Exception):
         'heic',
         'ico',
         'image',
+        'jfif',
         'jng',
         'jpe',
         'jpeg',
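
Note: these two hunks extend the allow-lists behind yt-dlp's unsafe-extension check, so `.vid` videos and `.jfif` thumbnails pass validation instead of aborting. A hedged sketch of the effect; the import path and the sanitize_extension classmethod shape are assumptions to verify against yt_dlp/utils/_utils.py:

    from yt_dlp.utils._utils import _UnsafeExtensionError

    # with the hunks above applied, these no longer raise (assumed classmethod)
    for ext in ('vid', 'jfif'):
        _UnsafeExtensionError.sanitize_extension(ext)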

View File: yt_dlp/utils/traversal.py

@@ -20,6 +20,7 @@ from ._utils import (
     get_elements_html_by_class,
     get_elements_html_by_attribute,
     get_elements_by_attribute,
+    get_element_by_class,
     get_element_html_by_attribute,
     get_element_by_attribute,
     get_element_html_by_id,
@@ -373,7 +374,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
 @typing.overload
-def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ...
+def find_element(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ...


 @typing.overload
@@ -381,14 +382,14 @@ def find_element(*, cls: str, html=False): ...
 @typing.overload
-def find_element(*, id: str, tag: str | None = None, html=False): ...
+def find_element(*, id: str, tag: str | None = None, html=False, regex=False): ...


 @typing.overload
-def find_element(*, tag: str, html=False): ...
+def find_element(*, tag: str, html=False, regex=False): ...


-def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False):
+def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False, regex=False):
     # deliberately using `id=` and `cls=` for ease of readability
     assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required'

     ANY_TAG = r'[\w:.-]+'
@@ -397,17 +398,18 @@ def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False):
         assert not cls, 'Cannot match both attr and cls'
         assert not id, 'Cannot match both attr and id'
         func = get_element_html_by_attribute if html else get_element_by_attribute
-        return functools.partial(func, attr, value, tag=tag or ANY_TAG)
+        return functools.partial(func, attr, value, tag=tag or ANY_TAG, escape_value=not regex)
     elif cls:
         assert not id, 'Cannot match both cls and id'
         assert tag is None, 'Cannot match both cls and tag'
-        func = get_element_html_by_class if html else get_elements_by_class
+        assert not regex, 'Cannot use regex with cls'
+        func = get_element_html_by_class if html else get_element_by_class
         return functools.partial(func, cls)
     elif id:
         func = get_element_html_by_id if html else get_element_by_id
-        return functools.partial(func, id, tag=tag or ANY_TAG)
+        return functools.partial(func, id, tag=tag or ANY_TAG, escape_value=not regex)

     index = int(bool(html))
     return lambda html: get_element_text_and_html_by_tag(tag, html)[index]
@@ -418,19 +420,20 @@ def find_elements(*, cls: str, html=False): ...
 @typing.overload
-def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ...
+def find_elements(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ...


-def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False):
+def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False, regex=False):
     # deliberately using `cls=` for ease of readability
     assert cls or (attr and value), 'One of cls or (attr AND value) is required'

     if attr and value:
         assert not cls, 'Cannot match both attr and cls'
         func = get_elements_html_by_attribute if html else get_elements_by_attribute
-        return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+')
+        return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+', escape_value=not regex)

     assert not tag, 'Cannot match both cls and tag'
+    assert not regex, 'Cannot use regex with cls'
     func = get_elements_html_by_class if html else get_elements_by_class
     return functools.partial(func, cls)
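
Note: the regex switch threads escape_value=not regex down to the underlying get_element* helpers, so patterns are escaped and matched literally by default and only treated as regular expressions when regex=True; combining regex with cls is rejected by assertion. Restating a few assertions from the test diff above:

    from yt_dlp.utils.traversal import find_element

    html = '''<html><body>
        <div class="a">1</div>
        <div class="a" id="x" custom="z">2</div>
        <div class="b" data-id="y" custom="z">3</div>
    </body></html>'''

    assert find_element(id='[ex]')(html) is None             # literal '[ex]': no such id
    assert find_element(id='[ex]', regex=True)(html) == '2'  # character class matches id="x"
    assert find_element(attr='data-id', value='y(?:es)?', regex=True)(html) == '3'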