[ie/learningonscreen] raise login required with new session_cookies method

Authored by: bashonly
another one
2024-11-28 02:01:25 +01:00 · 2024-11-03 13:28:06 -06:00 · 2024-11-03 13:25:17 -06:00 · 2024-11-03 20:23:35 +01:00 · 2024-11-03 20:12:50 +01:00 · 2024-11-03 13:05:15 -06:00
13 changed files with 160 additions and 127 deletions
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
@@ -13,6 +13,8 @@ from yt_dlp.utils import (
    str_or_none,
 )
 from yt_dlp.utils.traversal import (
+    find_element,
+    find_elements,
    require,
    subs_list_to_dict,
    traverse_obj,
@@ -37,6 +39,14 @@ _TEST_DATA = {
    'dict': {},
 }

+_TEST_HTML = '''<html><body>
+    <div class="a">1</div>
+    <div class="a" id="x" custom="z">2</div>
+    <div class="b" data-id="y" custom="z">3</div>
+    <p class="a">4</p>
+    <p id="d" custom="e">5</p>
+</body></html>'''
+

 class TestTraversal:
    def test_traversal_base(self):
@@ -521,6 +531,50 @@ class TestTraversalHelpers:
        with pytest.raises(TypeError):
            unpack()

+    def test_find_element(self):
+        for improper_kwargs in [
+            dict(attr='data-id'),
+            dict(value='y'),
+            dict(attr='data-id', value='y', cls='a'),
+            dict(attr='data-id', value='y', id='x'),
+            dict(cls='a', id='x'),
+            dict(cls='a', tag='p'),
+            dict(cls='[ab]', regex=True),
+        ]:
+            with pytest.raises(AssertionError):
+                find_element(**improper_kwargs)(_TEST_HTML)
+
+        assert find_element(cls='a')(_TEST_HTML) == '1'
+        assert find_element(cls='a', html=True)(_TEST_HTML) == '<div class="a">1</div>'
+        assert find_element(id='x')(_TEST_HTML) == '2'
+        assert find_element(id='[ex]')(_TEST_HTML) is None
+        assert find_element(id='[ex]', regex=True)(_TEST_HTML) == '2'
+        assert find_element(id='x', html=True)(_TEST_HTML) == '<div class="a" id="x" custom="z">2</div>'
+        assert find_element(attr='data-id', value='y')(_TEST_HTML) == '3'
+        assert find_element(attr='data-id', value='y(?:es)?')(_TEST_HTML) is None
+        assert find_element(attr='data-id', value='y(?:es)?', regex=True)(_TEST_HTML) == '3'
+        assert find_element(
+            attr='data-id', value='y', html=True)(_TEST_HTML) == '<div class="b" data-id="y" custom="z">3</div>'
+
+    def test_find_elements(self):
+        for improper_kwargs in [
+            dict(tag='p'),
+            dict(attr='data-id'),
+            dict(value='y'),
+            dict(attr='data-id', value='y', cls='a'),
+            dict(cls='a', tag='div'),
+            dict(cls='[ab]', regex=True),
+        ]:
+            with pytest.raises(AssertionError):
+                find_elements(**improper_kwargs)(_TEST_HTML)
+
+        assert find_elements(cls='a')(_TEST_HTML) == ['1', '2', '4']
+        assert find_elements(cls='a', html=True)(_TEST_HTML) == [
+            '<div class="a">1</div>', '<div class="a" id="x" custom="z">2</div>', '<p class="a">4</p>']
+        assert find_elements(attr='custom', value='z')(_TEST_HTML) == ['2', '3']
+        assert find_elements(attr='custom', value='[ez]')(_TEST_HTML) == []
+        assert find_elements(attr='custom', value='[ez]', regex=True)(_TEST_HTML) == ['2', '3', '5']
+

 class TestDictGet:
    def test_dict_get(self):
--- a/yt_dlp/extractor/bandcamp.py
+++ b/yt_dlp/extractor/bandcamp.py
@@ -1,4 +1,3 @@
-import functools
 import json
 import random
 import re
@@ -10,7 +9,6 @@ from ..utils import (
    ExtractorError,
    extract_attributes,
    float_or_none,
-    get_element_html_by_id,
    int_or_none,
    parse_filesize,
    str_or_none,
@@ -21,7 +19,7 @@ from ..utils import (
    url_or_none,
    urljoin,
 )
-from ..utils.traversal import traverse_obj
+from ..utils.traversal import find_element, traverse_obj


 class BandcampIE(InfoExtractor):
@@ -511,7 +509,7 @@ class BandcampUserIE(InfoExtractor):
            or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))

        yield from traverse_obj(webpage, (
-            {functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes},
+            {find_element(id='music-grid', html=True)}, {extract_attributes},
            'data-client-items', {json.loads}, ..., 'page_url', {str}))

    def _real_extract(self, url):
--- a/yt_dlp/extractor/bpb.py
+++ b/yt_dlp/extractor/bpb.py
@@ -1,35 +1,20 @@
-import functools
 import re

 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    extract_attributes,
-    get_element_text_and_html_by_tag,
-    get_elements_by_class,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    unified_strdate,
    url_or_none,
    urljoin,
-    variadic,
 )
-from ..utils.traversal import traverse_obj
-
-
-def html_get_element(tag=None, cls=None):
-    assert tag or cls, 'One of tag or class is required'
-
-    if cls:
-        func = functools.partial(get_elements_by_class, cls, tag=tag)
-    else:
-        func = functools.partial(get_element_text_and_html_by_tag, tag)
-
-    def html_get_element_wrapper(html):
-        return variadic(func(html))[0]
-
-    return html_get_element_wrapper
+from ..utils.traversal import (
+    find_element,
+    traverse_obj,
+)


 class BpbIE(InfoExtractor):
@@ -41,12 +26,12 @@ class BpbIE(InfoExtractor):
        'info_dict': {
            'id': '297',
            'ext': 'mp4',
-            'creator': 'Kooperative Berlin',
-            'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
-            'release_date': '20160115',
+            'creators': ['Kooperative Berlin'],
+            'description': r're:Joachim Gauck, .*\n\nKamera: .*',
+            'release_date': '20150716',
            'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
-            'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
-            'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
+            'tags': [],
+            'thumbnail': r're:https?://www\.bpb\.de/cache/images/7/297_teaser_16x9_1240\.jpg.*',
            'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
@@ -55,11 +40,12 @@ class BpbIE(InfoExtractor):
        'info_dict': {
            'id': '522184',
            'ext': 'mp4',
-            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
+            'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
            'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
            'release_date': '20230621',
-            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
-            'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
+            'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
+            'tags': [],
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/4/522184_teaser_16x9_1240\.png.*',
            'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
@@ -68,11 +54,12 @@ class BpbIE(InfoExtractor):
        'info_dict': {
            'id': '518789',
            'ext': 'mp4',
-            'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
+            'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
            'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
            'release_date': '20230302',
-            'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
-            'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
+            'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
+            'tags': [],
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/9/518789_teaser_16x9_1240\.jpeg.*',
            'title': 'md5:3e956f264bb501f6383f10495a401da4',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
@@ -84,12 +71,12 @@ class BpbIE(InfoExtractor):
        'info_dict': {
            'id': '315813',
            'ext': 'mp3',
-            'creator': 'Axel Schröder',
+            'creators': ['Axel Schröder'],
            'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
            'release_date': '20200921',
            'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
            'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
-            'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/3/315813_teaser_16x9_1240\.png.*',
            'title': 'Folge 1: Eine Einführung',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
@@ -98,12 +85,12 @@ class BpbIE(InfoExtractor):
        'info_dict': {
            'id': '517806',
            'ext': 'mp3',
-            'creator': 'Bundeszentrale für politische Bildung',
+            'creators': ['Bundeszentrale für politische Bildung'],
            'description': 'md5:594689600e919912aade0b2871cc3fed',
            'release_date': '20230127',
            'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
            'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
-            'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
+            'thumbnail': r're:https://www\.bpb\.de/cache/images/6/517806_teaser_16x9_1240\.png.*',
            'title': 'Die Weltanschauung der "Neuen Rechten"',
            'uploader': 'Bundeszentrale für politische Bildung',
        },
@@ -147,7 +134,7 @@ class BpbIE(InfoExtractor):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

-        title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
+        title_result = traverse_obj(webpage, ({find_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
        json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))

        return {
@@ -156,10 +143,10 @@ class BpbIE(InfoExtractor):
            # This metadata could be interpreted otherwise, but it fits "series" the most
            'series': traverse_obj(title_result, ('series', {str.strip})) or None,
            'description': join_nonempty(*traverse_obj(webpage, [(
-                {html_get_element(cls='opening-intro')},
-                [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
+                {find_element(cls='opening-intro')},
+                [{find_element(tag='bpb-accordion-item')}, {find_element(cls='text-content')}],
            ), {clean_html}]), delim='\n\n') or None,
-            'creator': self._html_search_meta('author', webpage),
+            'creators': traverse_obj(self._html_search_meta('author', webpage), all),
            'uploader': self._html_search_meta('publisher', webpage),
            'release_date': unified_strdate(self._html_search_meta('date', webpage)),
            'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
--- a/yt_dlp/extractor/bundestag.py
+++ b/yt_dlp/extractor/bundestag.py
@@ -8,11 +8,13 @@ from ..utils import (
    bug_reports_message,
    clean_html,
    format_field,
-    get_element_text_and_html_by_tag,
    int_or_none,
    url_or_none,
 )
-from ..utils.traversal import traverse_obj
+from ..utils.traversal import (
+    find_element,
+    traverse_obj,
+)


 class BundestagIE(InfoExtractor):
@@ -115,9 +117,8 @@ class BundestagIE(InfoExtractor):
            note='Downloading metadata overlay', fatal=False,
        ), {
            'title': (
-                {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
-                {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
-            'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
+                {find_element(tag='h3')}, {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
+            'description': ({find_element(tag='p')}, {clean_html}),
        }))

        return result
--- a/yt_dlp/extractor/learningonscreen.py
+++ b/yt_dlp/extractor/learningonscreen.py
@@ -6,13 +6,11 @@ from ..utils import (
    ExtractorError,
    clean_html,
    extract_attributes,
-    get_element_by_class,
-    get_element_html_by_id,
    join_nonempty,
    parse_duration,
    unified_timestamp,
 )
-from ..utils.traversal import traverse_obj
+from ..utils.traversal import find_element, traverse_obj


 class LearningOnScreenIE(InfoExtractor):
@@ -32,28 +30,24 @@ class LearningOnScreenIE(InfoExtractor):

    def _real_initialize(self):
        if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
-            self.raise_login_required(
-                'Use --cookies for authentication. See '
-                ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  '
-                'for how to manually pass cookies', method=None)
+            self.raise_login_required(method='session_cookies')

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        details = traverse_obj(webpage, (
-            {functools.partial(get_element_html_by_id, 'programme-details')}, {
-                'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}),
+            {find_element(id='programme-details', html=True)}, {
+                'title': ({find_element(tag='h2')}, {clean_html}),
                'timestamp': (
-                    {functools.partial(get_element_by_class, 'broadcast-date')},
+                    {find_element(cls='broadcast-date')},
                    {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
                'duration': (
-                    {functools.partial(get_element_by_class, 'prog-running-time')},
-                    {clean_html}, {parse_duration}),
+                    {find_element(cls='prog-running-time')}, {clean_html}, {parse_duration}),
            }))

        title = details.pop('title', None) or traverse_obj(webpage, (
-            {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')},
+            {find_element(id='add-to-existing-playlist', html=True)},
            {extract_attributes}, 'data-record-title', {clean_html}))

        entries = self._parse_html5_media_entries(
--- a/yt_dlp/extractor/listennotes.py
+++ b/yt_dlp/extractor/listennotes.py
@@ -6,12 +6,10 @@ from ..utils import (
    extract_attributes,
    get_element_by_class,
    get_element_html_by_id,
-    get_element_text_and_html_by_tag,
    parse_duration,
    strip_or_none,
-    traverse_obj,
-    try_call,
 )
+from ..utils.traversal import find_element, traverse_obj


 class ListenNotesIE(InfoExtractor):
@@ -22,14 +20,14 @@ class ListenNotesIE(InfoExtractor):
        'info_dict': {
            'id': 'KrDgvNb_u1n',
            'ext': 'mp3',
-            'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
-            'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
-            'duration': 2148.0,
-            'channel': 'Thriving on Overload',
+            'title': r're:Tim O’Reilly on noticing things other people .{113}',
+            'description': r're:(?s)‘’We shape reality by what we notice and .{27459}',
+            'duration': 2215.0,
+            'channel': 'Amplifying Cognition',
            'channel_id': 'ed84wITivxF',
            'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
-            'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
-            'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
+            'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/amplifying-cognition-ross-dawson-Iemft4Gdr0k-ed84wITivxF.300x300.jpg',
+            'channel_url': 'https://www.listennotes.com/podcasts/amplifying-cognition-ross-dawson-ed84wITivxF/',
            'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
        },
    }, {
@@ -39,13 +37,13 @@ class ListenNotesIE(InfoExtractor):
            'id': 'lwEA3154JzG',
            'ext': 'mp3',
            'title': 'Episode 177: WireGuard with Jason Donenfeld',
-            'description': 'md5:24744f36456a3e95f83c1193a3458594',
+            'description': r're:(?s)Jason Donenfeld lead developer joins us this hour to discuss WireGuard, .{3169}',
            'duration': 3861.0,
            'channel': 'Ask Noah Show',
            'channel_id': '4DQTzdS5-j7',
            'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
            'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
-            'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
+            'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-gD7vG150cxf-4DQTzdS5-j7.300x300.jpg',
            'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
        },
    }]
@@ -70,7 +68,7 @@ class ListenNotesIE(InfoExtractor):
            'id': audio_id,
            'url': data['audio'],
            'title': (data.get('data-title')
-                      or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
+                      or traverse_obj(webpage, ({find_element(tag='h1')}, {clean_html}))
                      or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
            'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
                            or strip_or_none(description)),
--- a/yt_dlp/extractor/monstercat.py
+++ b/yt_dlp/extractor/monstercat.py
@@ -4,15 +4,11 @@ from .common import InfoExtractor
 from ..utils import (
    clean_html,
    extract_attributes,
-    get_element_by_class,
-    get_element_html_by_class,
-    get_element_text_and_html_by_tag,
    int_or_none,
    strip_or_none,
-    traverse_obj,
-    try_call,
    unified_strdate,
 )
+from ..utils.traversal import find_element, traverse_obj


 class MonstercatIE(InfoExtractor):
@@ -26,19 +22,21 @@ class MonstercatIE(InfoExtractor):
            'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
            'release_date': '20230711',
            'album': 'The Secret Language of Trees',
-            'album_artist': 'BT',
+            'album_artists': ['BT'],
        },
    }]

    def _extract_tracks(self, table, album_meta):
        for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table):  # regex by chatgpt due to lack of get_elements_by_tag
-            title = clean_html(try_call(
-                lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0]))
-            ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '')
+            title = traverse_obj(td, (
+                {find_element(cls='d-inline-flex flex-column')},
+                {lambda x: x.partition(' <span')}, 0, {clean_html}))
+            ids = traverse_obj(td, (
+                {find_element(cls='btn-play cursor-pointer mr-small', html=True)}, {extract_attributes})) or {}
            track_id = ids.get('data-track-id')
            release_id = ids.get('data-release-id')

-            track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td)))
+            track_number = traverse_obj(td, ({find_element(cls='py-xsmall')}, {int_or_none}))
            if not track_id or not release_id:
                self.report_warning(f'Skipping track {track_number}, ID(s) not found')
                self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
@@ -48,7 +46,7 @@ class MonstercatIE(InfoExtractor):
                'title': title,
                'track': title,
                'track_number': track_number,
-                'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))),
+                'artists': traverse_obj(td, ({find_element(cls='d-block fs-xxsmall')}, {clean_html}, all)),
                'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
                'id': track_id,
                'ext': 'mp3',
@@ -57,20 +55,19 @@ class MonstercatIE(InfoExtractor):
    def _real_extract(self, url):
        url_id = self._match_id(url)
        html = self._download_webpage(url, url_id)
-        # wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html
-        tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or ''
-
-        title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0])
-        date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block',
-                            html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate}))
+        # NB: HTMLParser may choke on this html; use {find_element} or try_call(lambda: get_element...)
+        tracklist_table = traverse_obj(html, {find_element(cls='table table-small')}) or ''
+        title = traverse_obj(html, ({find_element(tag='h1')}, {clean_html}))

        album_meta = {
            'title': title,
            'album': title,
            'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
-            'album_artist': try_call(
-                lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)),
-            'release_date': date,
+            'album_artists': traverse_obj(html, (
+                {find_element(cls='h-normal text-uppercase mb-desktop-medium mb-smallish')}, {clean_html}, all)),
+            'release_date': traverse_obj(html, (
+                {find_element(cls='font-italic mb-medium d-tablet-none d-phone-block')},
+                {lambda x: x.partition('Released ')}, 2, {strip_or_none}, {unified_strdate})),
        }

        return self.playlist_result(
--- a/yt_dlp/extractor/nekohacker.py
+++ b/yt_dlp/extractor/nekohacker.py
@@ -6,12 +6,10 @@ from ..utils import (
    determine_ext,
    extract_attributes,
    get_element_by_class,
-    get_element_text_and_html_by_tag,
    parse_duration,
-    traverse_obj,
-    try_call,
    url_or_none,
 )
+from ..utils.traversal import find_element, traverse_obj


 class NekoHackerIE(InfoExtractor):
@@ -35,7 +33,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20221101',
                    'album': 'Nekoverse',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'Spaceship',
                    'track_number': 1,
                    'duration': 195.0,
@@ -53,7 +51,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20221101',
                    'album': 'Nekoverse',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'City Runner',
                    'track_number': 2,
                    'duration': 148.0,
@@ -71,7 +69,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20221101',
                    'album': 'Nekoverse',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'Nature Talk',
                    'track_number': 3,
                    'duration': 174.0,
@@ -89,7 +87,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20221101',
                    'album': 'Nekoverse',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'Crystal World',
                    'track_number': 4,
                    'duration': 199.0,
@@ -115,7 +113,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20210115',
                    'album': '進め！むじなカンパニー',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
                    'track_number': 1,
                },
@@ -132,7 +130,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20210115',
                    'album': '進め！むじなカンパニー',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
                    'track_number': 2,
                },
@@ -149,7 +147,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20210115',
                    'album': '進め！むじなカンパニー',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': '進め！むじなカンパニー (instrumental)',
                    'track_number': 3,
                },
@@ -166,7 +164,7 @@ class NekoHackerIE(InfoExtractor):
                    'acodec': 'mp3',
                    'release_date': '20210115',
                    'album': '進め！むじなカンパニー',
-                    'artist': 'Neko Hacker',
+                    'artists': ['Neko Hacker'],
                    'track': 'むじな de なじむ (instrumental)',
                    'track_number': 4,
                },
@@ -181,14 +179,17 @@ class NekoHackerIE(InfoExtractor):
        playlist = get_element_by_class('playlist', webpage)

        if not playlist:
-            iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or ''
-            iframe_src = url_or_none(extract_attributes(iframe).get('src'))
+            iframe_src = traverse_obj(webpage, (
+                {find_element(tag='iframe', html=True)}, {extract_attributes}, 'src', {url_or_none}))
            if not iframe_src:
                raise ExtractorError('No playlist or embed found in webpage')
            elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src):
                raise ExtractorError('Spotify embeds are not supported', expected=True)
            return self.url_result(url, 'Generic')

+        player_params = self._search_json(
+            r'var srp_player_params_[\da-f]+\s*=', webpage, 'player params', playlist_id, default={})
+
        entries = []
        for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1):
            entry = traverse_obj(extract_attributes(track), {
@@ -200,12 +201,12 @@ class NekoHackerIE(InfoExtractor):
                'album': 'data-albumtitle',
                'duration': ('data-tracktime', {parse_duration}),
                'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
-                'thumbnail': ('data-albumart', {url_or_none}),
            })
            entries.append({
                **entry,
+                'thumbnail': url_or_none(player_params.get('artwork')),
                'track_number': track_number,
-                'artist': 'Neko Hacker',
+                'artists': ['Neko Hacker'],
                'vcodec': 'none',
                'acodec': 'mp3' if entry['ext'] == 'mp3' else None,
            })
--- a/yt_dlp/extractor/nubilesporn.py
+++ b/yt_dlp/extractor/nubilesporn.py
@@ -10,10 +10,10 @@ from ..utils import (
    get_element_html_by_class,
    get_elements_by_class,
    int_or_none,
-    try_call,
    unified_timestamp,
    urlencode_postdata,
 )
+from ..utils.traversal import find_element, find_elements, traverse_obj


 class NubilesPornIE(InfoExtractor):
@@ -70,9 +70,8 @@ class NubilesPornIE(InfoExtractor):
            url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0]

        channel_id, channel_name = self._search_regex(
-            r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page),
+            r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page) or '',
            'channel', fatal=False, group=('id', 'name')) or (None, None)
-        channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name)

        return {
            'id': video_id,
@@ -82,14 +81,14 @@ class NubilesPornIE(InfoExtractor):
            'thumbnail': media_entries.get('thumbnail'),
            'description': clean_html(get_element_html_by_class('content-pane-description', page)),
            'timestamp': unified_timestamp(get_element_by_class('date', page)),
-            'channel': channel_name,
+            'channel': re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) if channel_name else None,
            'channel_id': channel_id,
            'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'),
            'like_count': int_or_none(get_element_by_id('likecount', page)),
            'average_rating': float_or_none(get_element_by_class('score', page)),
            'age_limit': 18,
-            'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))),
-            'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))),
+            'categories': traverse_obj(page, ({find_element(cls='categories')}, {find_elements(cls='btn')}, ..., {clean_html})),
+            'tags': traverse_obj(page, ({find_elements(cls='tags')}, 1, {find_elements(cls='btn')}, ..., {clean_html})),
            'cast': get_elements_by_class('content-pane-performer', page),
            'availability': 'needs_auth',
            'series': channel_name,
--- a/yt_dlp/extractor/tbsjp.py
+++ b/yt_dlp/extractor/tbsjp.py
@@ -3,14 +3,12 @@ from ..networking.exceptions import HTTPError
 from ..utils import (
    ExtractorError,
    clean_html,
-    get_element_text_and_html_by_tag,
    int_or_none,
    str_or_none,
-    traverse_obj,
-    try_call,
    unified_timestamp,
    urljoin,
 )
+from ..utils.traversal import find_element, traverse_obj


 class TBSJPEpisodeIE(InfoExtractor):
@@ -64,7 +62,7 @@ class TBSJPEpisodeIE(InfoExtractor):
            self._merge_subtitles(subs, target=subtitles)

        return {
-            'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])),
+            'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})),
            'id': video_id,
            **traverse_obj(episode, {
                'categories': ('keywords', {list}),
--- a/yt_dlp/extractor/telewebion.py
+++ b/yt_dlp/extractor/telewebion.py
@@ -14,6 +14,7 @@ def _fmt_url(url):


 class TelewebionIE(InfoExtractor):
+    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?telewebion\.com/episode/(?P<id>(?:0x[a-fA-F\d]+|\d+))'
    _TESTS = [{
        'url': 'http://www.telewebion.com/episode/0x1b3139c/',
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@ -5142,6 +5142,7 @@ class _UnsafeExtensionError(Exception):
        'rm',
        'swf',
        'ts',
+        'vid',
        'vob',
        'vp9',

@ -5174,6 +5175,7 @@ class _UnsafeExtensionError(Exception):
        'heic',
        'ico',
        'image',
+        'jfif',
        'jng',
        'jpe',
        'jpeg',
--- a/yt_dlp/utils/traversal.py
+++ b/yt_dlp/utils/traversal.py
@ -20,6 +20,7 @@ from ._utils import (
    get_elements_html_by_class,
    get_elements_html_by_attribute,
    get_elements_by_attribute,
+    get_element_by_class,
    get_element_html_by_attribute,
    get_element_by_attribute,
    get_element_html_by_id,
@ -373,7 +374,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):


@typing.overload
-def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ...
+def find_element(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ...


@typing.overload
@ -381,14 +382,14 @@ def find_element(*, cls: str, html=False): ...


@typing.overload
-def find_element(*, id: str, tag: str | None = None, html=False): ...
+def find_element(*, id: str, tag: str | None = None, html=False, regex=False): ...


@typing.overload
-def find_element(*, tag: str, html=False): ...
+def find_element(*, tag: str, html=False, regex=False): ...


-def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False):
+def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False, regex=False):
    # deliberately using `id=` and `cls=` for ease of readability
    assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required'
    ANY_TAG = r'[\w:.-]+'
@ -397,17 +398,18 @@ def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=Fal
        assert not cls, 'Cannot match both attr and cls'
        assert not id, 'Cannot match both attr and id'
        func = get_element_html_by_attribute if html else get_element_by_attribute
-        return functools.partial(func, attr, value, tag=tag or ANY_TAG)
+        return functools.partial(func, attr, value, tag=tag or ANY_TAG, escape_value=not regex)

    elif cls:
        assert not id, 'Cannot match both cls and id'
        assert tag is None, 'Cannot match both cls and tag'
-        func = get_element_html_by_class if html else get_elements_by_class
+        assert not regex, 'Cannot use regex with cls'
+        func = get_element_html_by_class if html else get_element_by_class
        return functools.partial(func, cls)

    elif id:
        func = get_element_html_by_id if html else get_element_by_id
-        return functools.partial(func, id, tag=tag or ANY_TAG)
+        return functools.partial(func, id, tag=tag or ANY_TAG, escape_value=not regex)

    index = int(bool(html))
    return lambda html: get_element_text_and_html_by_tag(tag, html)[index]
@ -418,19 +420,20 @@ def find_elements(*, cls: str, html=False): ...


@typing.overload
-def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ...
+def find_elements(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ...


-def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False):
+def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False, regex=False):
    # deliberately using `cls=` for ease of readability
    assert cls or (attr and value), 'One of cls or (attr AND value) is required'

    if attr and value:
        assert not cls, 'Cannot match both attr and cls'
        func = get_elements_html_by_attribute if html else get_elements_by_attribute
-        return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+')
+        return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+', escape_value=not regex)

    assert not tag, 'Cannot match both cls and tag'
+    assert not regex, 'Cannot use regex with cls'
    func = get_elements_html_by_class if html else get_elements_by_class
    return functools.partial(func, cls)
Author	SHA1	Message	Date
bashonly	758cadf123	[ie/learningonscreen] raise login required with new session_cookies method Authored by: bashonly	2024-11-03 13:28:06 -06:00
bashonly	ad786342e2	another one Authored by: bashonly	2024-11-03 13:25:17 -06:00
Simon Sawicki	e6226206fc	shrug	2024-11-03 20:23:35 +01:00
Simon Sawicki	21c9c265ca	Some more adjustments	2024-11-03 20:12:50 +01:00
bashonly	a198c6f8eb	Even more traversal helper cleanup Authored by: bashonly	2024-11-03 13:05:15 -06:00
bashonly	962edb31ae	Merge branch 'yt-dlp:master' into misc-cleanup-another-one	2024-11-03 12:21:22 -06:00
bashonly	b103aca24d	[utils] Fix and improve `find_element` and `find_elements` (#11443) Fix `d710a6ca7c` Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki <contact@grub4k.xyz>	2024-11-03 18:19:45 +00:00
bashonly	e705746e27	Allow videos with `.vid` extension	2024-11-03 10:31:29 -06:00
bashonly	4f7f711a7b	Allow thumbnails with `.jfif` extension Authored by: bashonly	2024-11-03 10:26:15 -06:00