mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-11-28 10:11:25 +01:00
Compare commits
No commits in common. "758cadf1236604aa5556bbbe1e386c1758a2a003" and "fed6bcbaae6496ee5daa5645e12e6d9f7d05dd5d" have entirely different histories.
758cadf123
...
fed6bcbaae
|
@ -13,8 +13,6 @@ from yt_dlp.utils import (
|
||||||
str_or_none,
|
str_or_none,
|
||||||
)
|
)
|
||||||
from yt_dlp.utils.traversal import (
|
from yt_dlp.utils.traversal import (
|
||||||
find_element,
|
|
||||||
find_elements,
|
|
||||||
require,
|
require,
|
||||||
subs_list_to_dict,
|
subs_list_to_dict,
|
||||||
traverse_obj,
|
traverse_obj,
|
||||||
|
@ -39,14 +37,6 @@ _TEST_DATA = {
|
||||||
'dict': {},
|
'dict': {},
|
||||||
}
|
}
|
||||||
|
|
||||||
_TEST_HTML = '''<html><body>
|
|
||||||
<div class="a">1</div>
|
|
||||||
<div class="a" id="x" custom="z">2</div>
|
|
||||||
<div class="b" data-id="y" custom="z">3</div>
|
|
||||||
<p class="a">4</p>
|
|
||||||
<p id="d" custom="e">5</p>
|
|
||||||
</body></html>'''
|
|
||||||
|
|
||||||
|
|
||||||
class TestTraversal:
|
class TestTraversal:
|
||||||
def test_traversal_base(self):
|
def test_traversal_base(self):
|
||||||
|
@ -531,50 +521,6 @@ class TestTraversalHelpers:
|
||||||
with pytest.raises(TypeError):
|
with pytest.raises(TypeError):
|
||||||
unpack()
|
unpack()
|
||||||
|
|
||||||
def test_find_element(self):
|
|
||||||
for improper_kwargs in [
|
|
||||||
dict(attr='data-id'),
|
|
||||||
dict(value='y'),
|
|
||||||
dict(attr='data-id', value='y', cls='a'),
|
|
||||||
dict(attr='data-id', value='y', id='x'),
|
|
||||||
dict(cls='a', id='x'),
|
|
||||||
dict(cls='a', tag='p'),
|
|
||||||
dict(cls='[ab]', regex=True),
|
|
||||||
]:
|
|
||||||
with pytest.raises(AssertionError):
|
|
||||||
find_element(**improper_kwargs)(_TEST_HTML)
|
|
||||||
|
|
||||||
assert find_element(cls='a')(_TEST_HTML) == '1'
|
|
||||||
assert find_element(cls='a', html=True)(_TEST_HTML) == '<div class="a">1</div>'
|
|
||||||
assert find_element(id='x')(_TEST_HTML) == '2'
|
|
||||||
assert find_element(id='[ex]')(_TEST_HTML) is None
|
|
||||||
assert find_element(id='[ex]', regex=True)(_TEST_HTML) == '2'
|
|
||||||
assert find_element(id='x', html=True)(_TEST_HTML) == '<div class="a" id="x" custom="z">2</div>'
|
|
||||||
assert find_element(attr='data-id', value='y')(_TEST_HTML) == '3'
|
|
||||||
assert find_element(attr='data-id', value='y(?:es)?')(_TEST_HTML) is None
|
|
||||||
assert find_element(attr='data-id', value='y(?:es)?', regex=True)(_TEST_HTML) == '3'
|
|
||||||
assert find_element(
|
|
||||||
attr='data-id', value='y', html=True)(_TEST_HTML) == '<div class="b" data-id="y" custom="z">3</div>'
|
|
||||||
|
|
||||||
def test_find_elements(self):
|
|
||||||
for improper_kwargs in [
|
|
||||||
dict(tag='p'),
|
|
||||||
dict(attr='data-id'),
|
|
||||||
dict(value='y'),
|
|
||||||
dict(attr='data-id', value='y', cls='a'),
|
|
||||||
dict(cls='a', tag='div'),
|
|
||||||
dict(cls='[ab]', regex=True),
|
|
||||||
]:
|
|
||||||
with pytest.raises(AssertionError):
|
|
||||||
find_elements(**improper_kwargs)(_TEST_HTML)
|
|
||||||
|
|
||||||
assert find_elements(cls='a')(_TEST_HTML) == ['1', '2', '4']
|
|
||||||
assert find_elements(cls='a', html=True)(_TEST_HTML) == [
|
|
||||||
'<div class="a">1</div>', '<div class="a" id="x" custom="z">2</div>', '<p class="a">4</p>']
|
|
||||||
assert find_elements(attr='custom', value='z')(_TEST_HTML) == ['2', '3']
|
|
||||||
assert find_elements(attr='custom', value='[ez]')(_TEST_HTML) == []
|
|
||||||
assert find_elements(attr='custom', value='[ez]', regex=True)(_TEST_HTML) == ['2', '3', '5']
|
|
||||||
|
|
||||||
|
|
||||||
class TestDictGet:
|
class TestDictGet:
|
||||||
def test_dict_get(self):
|
def test_dict_get(self):
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import functools
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
|
@ -9,6 +10,7 @@ from ..utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
extract_attributes,
|
extract_attributes,
|
||||||
float_or_none,
|
float_or_none,
|
||||||
|
get_element_html_by_id,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
parse_filesize,
|
parse_filesize,
|
||||||
str_or_none,
|
str_or_none,
|
||||||
|
@ -19,7 +21,7 @@ from ..utils import (
|
||||||
url_or_none,
|
url_or_none,
|
||||||
urljoin,
|
urljoin,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, traverse_obj
|
from ..utils.traversal import traverse_obj
|
||||||
|
|
||||||
|
|
||||||
class BandcampIE(InfoExtractor):
|
class BandcampIE(InfoExtractor):
|
||||||
|
@ -509,7 +511,7 @@ class BandcampUserIE(InfoExtractor):
|
||||||
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
|
or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
|
||||||
|
|
||||||
yield from traverse_obj(webpage, (
|
yield from traverse_obj(webpage, (
|
||||||
{find_element(id='music-grid', html=True)}, {extract_attributes},
|
{functools.partial(get_element_html_by_id, 'music-grid')}, {extract_attributes},
|
||||||
'data-client-items', {json.loads}, ..., 'page_url', {str}))
|
'data-client-items', {json.loads}, ..., 'page_url', {str}))
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
|
|
|
@ -1,20 +1,35 @@
|
||||||
|
import functools
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
clean_html,
|
clean_html,
|
||||||
extract_attributes,
|
extract_attributes,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
|
get_elements_by_class,
|
||||||
join_nonempty,
|
join_nonempty,
|
||||||
js_to_json,
|
js_to_json,
|
||||||
mimetype2ext,
|
mimetype2ext,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
urljoin,
|
urljoin,
|
||||||
|
variadic,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import (
|
from ..utils.traversal import traverse_obj
|
||||||
find_element,
|
|
||||||
traverse_obj,
|
|
||||||
)
|
def html_get_element(tag=None, cls=None):
|
||||||
|
assert tag or cls, 'One of tag or class is required'
|
||||||
|
|
||||||
|
if cls:
|
||||||
|
func = functools.partial(get_elements_by_class, cls, tag=tag)
|
||||||
|
else:
|
||||||
|
func = functools.partial(get_element_text_and_html_by_tag, tag)
|
||||||
|
|
||||||
|
def html_get_element_wrapper(html):
|
||||||
|
return variadic(func(html))[0]
|
||||||
|
|
||||||
|
return html_get_element_wrapper
|
||||||
|
|
||||||
|
|
||||||
class BpbIE(InfoExtractor):
|
class BpbIE(InfoExtractor):
|
||||||
|
@ -26,12 +41,12 @@ class BpbIE(InfoExtractor):
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '297',
|
'id': '297',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'creators': ['Kooperative Berlin'],
|
'creator': 'Kooperative Berlin',
|
||||||
'description': r're:Joachim Gauck, .*\n\nKamera: .*',
|
'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
|
||||||
'release_date': '20150716',
|
'release_date': '20160115',
|
||||||
'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
|
'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
|
||||||
'tags': [],
|
'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
|
||||||
'thumbnail': r're:https?://www\.bpb\.de/cache/images/7/297_teaser_16x9_1240\.jpg.*',
|
'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
|
||||||
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
|
'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
|
||||||
'uploader': 'Bundeszentrale für politische Bildung',
|
'uploader': 'Bundeszentrale für politische Bildung',
|
||||||
},
|
},
|
||||||
|
@ -40,12 +55,11 @@ class BpbIE(InfoExtractor):
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '522184',
|
'id': '522184',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
|
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
|
||||||
'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
|
'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
|
||||||
'release_date': '20230621',
|
'release_date': '20230621',
|
||||||
'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
|
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
|
||||||
'tags': [],
|
'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
|
||||||
'thumbnail': r're:https://www\.bpb\.de/cache/images/4/522184_teaser_16x9_1240\.png.*',
|
|
||||||
'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
|
'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
|
||||||
'uploader': 'Bundeszentrale für politische Bildung',
|
'uploader': 'Bundeszentrale für politische Bildung',
|
||||||
},
|
},
|
||||||
|
@ -54,12 +68,11 @@ class BpbIE(InfoExtractor):
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '518789',
|
'id': '518789',
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'creators': ['Institute for Strategic Dialogue Germany gGmbH (ISD)'],
|
'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
|
||||||
'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
|
'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
|
||||||
'release_date': '20230302',
|
'release_date': '20230302',
|
||||||
'series': 'Narrative über den Krieg Russlands gegen die Ukraine (NUK)',
|
'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
|
||||||
'tags': [],
|
'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
|
||||||
'thumbnail': r're:https://www\.bpb\.de/cache/images/9/518789_teaser_16x9_1240\.jpeg.*',
|
|
||||||
'title': 'md5:3e956f264bb501f6383f10495a401da4',
|
'title': 'md5:3e956f264bb501f6383f10495a401da4',
|
||||||
'uploader': 'Bundeszentrale für politische Bildung',
|
'uploader': 'Bundeszentrale für politische Bildung',
|
||||||
},
|
},
|
||||||
|
@ -71,12 +84,12 @@ class BpbIE(InfoExtractor):
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '315813',
|
'id': '315813',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'creators': ['Axel Schröder'],
|
'creator': 'Axel Schröder',
|
||||||
'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
|
'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
|
||||||
'release_date': '20200921',
|
'release_date': '20200921',
|
||||||
'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
|
'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
|
||||||
'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
|
'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
|
||||||
'thumbnail': r're:https://www\.bpb\.de/cache/images/3/315813_teaser_16x9_1240\.png.*',
|
'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
|
||||||
'title': 'Folge 1: Eine Einführung',
|
'title': 'Folge 1: Eine Einführung',
|
||||||
'uploader': 'Bundeszentrale für politische Bildung',
|
'uploader': 'Bundeszentrale für politische Bildung',
|
||||||
},
|
},
|
||||||
|
@ -85,12 +98,12 @@ class BpbIE(InfoExtractor):
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '517806',
|
'id': '517806',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'creators': ['Bundeszentrale für politische Bildung'],
|
'creator': 'Bundeszentrale für politische Bildung',
|
||||||
'description': 'md5:594689600e919912aade0b2871cc3fed',
|
'description': 'md5:594689600e919912aade0b2871cc3fed',
|
||||||
'release_date': '20230127',
|
'release_date': '20230127',
|
||||||
'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
|
'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
|
||||||
'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
|
'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
|
||||||
'thumbnail': r're:https://www\.bpb\.de/cache/images/6/517806_teaser_16x9_1240\.png.*',
|
'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
|
||||||
'title': 'Die Weltanschauung der "Neuen Rechten"',
|
'title': 'Die Weltanschauung der "Neuen Rechten"',
|
||||||
'uploader': 'Bundeszentrale für politische Bildung',
|
'uploader': 'Bundeszentrale für politische Bildung',
|
||||||
},
|
},
|
||||||
|
@ -134,7 +147,7 @@ class BpbIE(InfoExtractor):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
title_result = traverse_obj(webpage, ({find_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
|
title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
|
||||||
json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
|
json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
@ -143,10 +156,10 @@ class BpbIE(InfoExtractor):
|
||||||
# This metadata could be interpreted otherwise, but it fits "series" the most
|
# This metadata could be interpreted otherwise, but it fits "series" the most
|
||||||
'series': traverse_obj(title_result, ('series', {str.strip})) or None,
|
'series': traverse_obj(title_result, ('series', {str.strip})) or None,
|
||||||
'description': join_nonempty(*traverse_obj(webpage, [(
|
'description': join_nonempty(*traverse_obj(webpage, [(
|
||||||
{find_element(cls='opening-intro')},
|
{html_get_element(cls='opening-intro')},
|
||||||
[{find_element(tag='bpb-accordion-item')}, {find_element(cls='text-content')}],
|
[{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
|
||||||
), {clean_html}]), delim='\n\n') or None,
|
), {clean_html}]), delim='\n\n') or None,
|
||||||
'creators': traverse_obj(self._html_search_meta('author', webpage), all),
|
'creator': self._html_search_meta('author', webpage),
|
||||||
'uploader': self._html_search_meta('publisher', webpage),
|
'uploader': self._html_search_meta('publisher', webpage),
|
||||||
'release_date': unified_strdate(self._html_search_meta('date', webpage)),
|
'release_date': unified_strdate(self._html_search_meta('date', webpage)),
|
||||||
'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
|
'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
|
||||||
|
|
|
@ -8,13 +8,11 @@ from ..utils import (
|
||||||
bug_reports_message,
|
bug_reports_message,
|
||||||
clean_html,
|
clean_html,
|
||||||
format_field,
|
format_field,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import (
|
from ..utils.traversal import traverse_obj
|
||||||
find_element,
|
|
||||||
traverse_obj,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class BundestagIE(InfoExtractor):
|
class BundestagIE(InfoExtractor):
|
||||||
|
@ -117,8 +115,9 @@ class BundestagIE(InfoExtractor):
|
||||||
note='Downloading metadata overlay', fatal=False,
|
note='Downloading metadata overlay', fatal=False,
|
||||||
), {
|
), {
|
||||||
'title': (
|
'title': (
|
||||||
{find_element(tag='h3')}, {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
|
{functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
|
||||||
'description': ({find_element(tag='p')}, {clean_html}),
|
{functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
|
||||||
|
'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
|
||||||
}))
|
}))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
|
@ -6,11 +6,13 @@ from ..utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
clean_html,
|
clean_html,
|
||||||
extract_attributes,
|
extract_attributes,
|
||||||
|
get_element_by_class,
|
||||||
|
get_element_html_by_id,
|
||||||
join_nonempty,
|
join_nonempty,
|
||||||
parse_duration,
|
parse_duration,
|
||||||
unified_timestamp,
|
unified_timestamp,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, traverse_obj
|
from ..utils.traversal import traverse_obj
|
||||||
|
|
||||||
|
|
||||||
class LearningOnScreenIE(InfoExtractor):
|
class LearningOnScreenIE(InfoExtractor):
|
||||||
|
@ -30,24 +32,28 @@ class LearningOnScreenIE(InfoExtractor):
|
||||||
|
|
||||||
def _real_initialize(self):
|
def _real_initialize(self):
|
||||||
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
|
if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
|
||||||
self.raise_login_required(method='session_cookies')
|
self.raise_login_required(
|
||||||
|
'Use --cookies for authentication. See '
|
||||||
|
' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp '
|
||||||
|
'for how to manually pass cookies', method=None)
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
details = traverse_obj(webpage, (
|
details = traverse_obj(webpage, (
|
||||||
{find_element(id='programme-details', html=True)}, {
|
{functools.partial(get_element_html_by_id, 'programme-details')}, {
|
||||||
'title': ({find_element(tag='h2')}, {clean_html}),
|
'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}),
|
||||||
'timestamp': (
|
'timestamp': (
|
||||||
{find_element(cls='broadcast-date')},
|
{functools.partial(get_element_by_class, 'broadcast-date')},
|
||||||
{functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
|
{functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
|
||||||
'duration': (
|
'duration': (
|
||||||
{find_element(cls='prog-running-time')}, {clean_html}, {parse_duration}),
|
{functools.partial(get_element_by_class, 'prog-running-time')},
|
||||||
|
{clean_html}, {parse_duration}),
|
||||||
}))
|
}))
|
||||||
|
|
||||||
title = details.pop('title', None) or traverse_obj(webpage, (
|
title = details.pop('title', None) or traverse_obj(webpage, (
|
||||||
{find_element(id='add-to-existing-playlist', html=True)},
|
{functools.partial(get_element_html_by_id, 'add-to-existing-playlist')},
|
||||||
{extract_attributes}, 'data-record-title', {clean_html}))
|
{extract_attributes}, 'data-record-title', {clean_html}))
|
||||||
|
|
||||||
entries = self._parse_html5_media_entries(
|
entries = self._parse_html5_media_entries(
|
||||||
|
|
|
@ -6,10 +6,12 @@ from ..utils import (
|
||||||
extract_attributes,
|
extract_attributes,
|
||||||
get_element_by_class,
|
get_element_by_class,
|
||||||
get_element_html_by_id,
|
get_element_html_by_id,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
parse_duration,
|
parse_duration,
|
||||||
strip_or_none,
|
strip_or_none,
|
||||||
|
traverse_obj,
|
||||||
|
try_call,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, traverse_obj
|
|
||||||
|
|
||||||
|
|
||||||
class ListenNotesIE(InfoExtractor):
|
class ListenNotesIE(InfoExtractor):
|
||||||
|
@ -20,14 +22,14 @@ class ListenNotesIE(InfoExtractor):
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'KrDgvNb_u1n',
|
'id': 'KrDgvNb_u1n',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'title': r're:Tim O’Reilly on noticing things other people .{113}',
|
'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
|
||||||
'description': r're:(?s)‘’We shape reality by what we notice and .{27459}',
|
'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
|
||||||
'duration': 2215.0,
|
'duration': 2148.0,
|
||||||
'channel': 'Amplifying Cognition',
|
'channel': 'Thriving on Overload',
|
||||||
'channel_id': 'ed84wITivxF',
|
'channel_id': 'ed84wITivxF',
|
||||||
'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
|
'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
|
||||||
'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/amplifying-cognition-ross-dawson-Iemft4Gdr0k-ed84wITivxF.300x300.jpg',
|
'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
|
||||||
'channel_url': 'https://www.listennotes.com/podcasts/amplifying-cognition-ross-dawson-ed84wITivxF/',
|
'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
|
||||||
'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
|
'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
|
@ -37,13 +39,13 @@ class ListenNotesIE(InfoExtractor):
|
||||||
'id': 'lwEA3154JzG',
|
'id': 'lwEA3154JzG',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'title': 'Episode 177: WireGuard with Jason Donenfeld',
|
'title': 'Episode 177: WireGuard with Jason Donenfeld',
|
||||||
'description': r're:(?s)Jason Donenfeld lead developer joins us this hour to discuss WireGuard, .{3169}',
|
'description': 'md5:24744f36456a3e95f83c1193a3458594',
|
||||||
'duration': 3861.0,
|
'duration': 3861.0,
|
||||||
'channel': 'Ask Noah Show',
|
'channel': 'Ask Noah Show',
|
||||||
'channel_id': '4DQTzdS5-j7',
|
'channel_id': '4DQTzdS5-j7',
|
||||||
'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
|
'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
|
||||||
'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
|
'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
|
||||||
'thumbnail': 'https://cdn-images-3.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-gD7vG150cxf-4DQTzdS5-j7.300x300.jpg',
|
'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
|
||||||
'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
|
'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
|
||||||
},
|
},
|
||||||
}]
|
}]
|
||||||
|
@ -68,7 +70,7 @@ class ListenNotesIE(InfoExtractor):
|
||||||
'id': audio_id,
|
'id': audio_id,
|
||||||
'url': data['audio'],
|
'url': data['audio'],
|
||||||
'title': (data.get('data-title')
|
'title': (data.get('data-title')
|
||||||
or traverse_obj(webpage, ({find_element(tag='h1')}, {clean_html}))
|
or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
|
||||||
or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
|
or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
|
||||||
'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
|
'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
|
||||||
or strip_or_none(description)),
|
or strip_or_none(description)),
|
||||||
|
|
|
@ -4,11 +4,15 @@ from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
clean_html,
|
clean_html,
|
||||||
extract_attributes,
|
extract_attributes,
|
||||||
|
get_element_by_class,
|
||||||
|
get_element_html_by_class,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
strip_or_none,
|
strip_or_none,
|
||||||
|
traverse_obj,
|
||||||
|
try_call,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, traverse_obj
|
|
||||||
|
|
||||||
|
|
||||||
class MonstercatIE(InfoExtractor):
|
class MonstercatIE(InfoExtractor):
|
||||||
|
@ -22,21 +26,19 @@ class MonstercatIE(InfoExtractor):
|
||||||
'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
|
'thumbnail': 'https://www.monstercat.com/release/742779548009/cover',
|
||||||
'release_date': '20230711',
|
'release_date': '20230711',
|
||||||
'album': 'The Secret Language of Trees',
|
'album': 'The Secret Language of Trees',
|
||||||
'album_artists': ['BT'],
|
'album_artist': 'BT',
|
||||||
},
|
},
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _extract_tracks(self, table, album_meta):
|
def _extract_tracks(self, table, album_meta):
|
||||||
for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag
|
for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag
|
||||||
title = traverse_obj(td, (
|
title = clean_html(try_call(
|
||||||
{find_element(cls='d-inline-flex flex-column')},
|
lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' <span')[0]))
|
||||||
{lambda x: x.partition(' <span')}, 0, {clean_html}))
|
ids = extract_attributes(try_call(lambda: get_element_html_by_class('btn-play cursor-pointer mr-small', td)) or '')
|
||||||
ids = traverse_obj(td, (
|
|
||||||
{find_element(cls='btn-play cursor-pointer mr-small', html=True)}, {extract_attributes})) or {}
|
|
||||||
track_id = ids.get('data-track-id')
|
track_id = ids.get('data-track-id')
|
||||||
release_id = ids.get('data-release-id')
|
release_id = ids.get('data-release-id')
|
||||||
|
|
||||||
track_number = traverse_obj(td, ({find_element(cls='py-xsmall')}, {int_or_none}))
|
track_number = int_or_none(try_call(lambda: get_element_by_class('py-xsmall', td)))
|
||||||
if not track_id or not release_id:
|
if not track_id or not release_id:
|
||||||
self.report_warning(f'Skipping track {track_number}, ID(s) not found')
|
self.report_warning(f'Skipping track {track_number}, ID(s) not found')
|
||||||
self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
|
self.write_debug(f'release_id={release_id!r} track_id={track_id!r}')
|
||||||
|
@ -46,7 +48,7 @@ class MonstercatIE(InfoExtractor):
|
||||||
'title': title,
|
'title': title,
|
||||||
'track': title,
|
'track': title,
|
||||||
'track_number': track_number,
|
'track_number': track_number,
|
||||||
'artists': traverse_obj(td, ({find_element(cls='d-block fs-xxsmall')}, {clean_html}, all)),
|
'artist': clean_html(try_call(lambda: get_element_by_class('d-block fs-xxsmall', td))),
|
||||||
'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
|
'url': f'https://www.monstercat.com/api/release/{release_id}/track-stream/{track_id}',
|
||||||
'id': track_id,
|
'id': track_id,
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
|
@ -55,19 +57,20 @@ class MonstercatIE(InfoExtractor):
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
url_id = self._match_id(url)
|
url_id = self._match_id(url)
|
||||||
html = self._download_webpage(url, url_id)
|
html = self._download_webpage(url, url_id)
|
||||||
# NB: HTMLParser may choke on this html; use {find_element} or try_call(lambda: get_element...)
|
# wrap all `get_elements` in `try_call`, HTMLParser has problems with site's html
|
||||||
tracklist_table = traverse_obj(html, {find_element(cls='table table-small')}) or ''
|
tracklist_table = try_call(lambda: get_element_by_class('table table-small', html)) or ''
|
||||||
title = traverse_obj(html, ({find_element(tag='h1')}, {clean_html}))
|
|
||||||
|
title = try_call(lambda: get_element_text_and_html_by_tag('h1', html)[0])
|
||||||
|
date = traverse_obj(html, ({lambda html: get_element_by_class('font-italic mb-medium d-tablet-none d-phone-block',
|
||||||
|
html).partition('Released ')}, 2, {strip_or_none}, {unified_strdate}))
|
||||||
|
|
||||||
album_meta = {
|
album_meta = {
|
||||||
'title': title,
|
'title': title,
|
||||||
'album': title,
|
'album': title,
|
||||||
'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
|
'thumbnail': f'https://www.monstercat.com/release/{url_id}/cover',
|
||||||
'album_artists': traverse_obj(html, (
|
'album_artist': try_call(
|
||||||
{find_element(cls='h-normal text-uppercase mb-desktop-medium mb-smallish')}, {clean_html}, all)),
|
lambda: get_element_by_class('h-normal text-uppercase mb-desktop-medium mb-smallish', html)),
|
||||||
'release_date': traverse_obj(html, (
|
'release_date': date,
|
||||||
{find_element(cls='font-italic mb-medium d-tablet-none d-phone-block')},
|
|
||||||
{lambda x: x.partition('Released ')}, 2, {strip_or_none}, {unified_strdate})),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return self.playlist_result(
|
return self.playlist_result(
|
||||||
|
|
|
@ -6,10 +6,12 @@ from ..utils import (
|
||||||
determine_ext,
|
determine_ext,
|
||||||
extract_attributes,
|
extract_attributes,
|
||||||
get_element_by_class,
|
get_element_by_class,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
parse_duration,
|
parse_duration,
|
||||||
|
traverse_obj,
|
||||||
|
try_call,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, traverse_obj
|
|
||||||
|
|
||||||
|
|
||||||
class NekoHackerIE(InfoExtractor):
|
class NekoHackerIE(InfoExtractor):
|
||||||
|
@ -33,7 +35,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20221101',
|
'release_date': '20221101',
|
||||||
'album': 'Nekoverse',
|
'album': 'Nekoverse',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'Spaceship',
|
'track': 'Spaceship',
|
||||||
'track_number': 1,
|
'track_number': 1,
|
||||||
'duration': 195.0,
|
'duration': 195.0,
|
||||||
|
@ -51,7 +53,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20221101',
|
'release_date': '20221101',
|
||||||
'album': 'Nekoverse',
|
'album': 'Nekoverse',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'City Runner',
|
'track': 'City Runner',
|
||||||
'track_number': 2,
|
'track_number': 2,
|
||||||
'duration': 148.0,
|
'duration': 148.0,
|
||||||
|
@ -69,7 +71,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20221101',
|
'release_date': '20221101',
|
||||||
'album': 'Nekoverse',
|
'album': 'Nekoverse',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'Nature Talk',
|
'track': 'Nature Talk',
|
||||||
'track_number': 3,
|
'track_number': 3,
|
||||||
'duration': 174.0,
|
'duration': 174.0,
|
||||||
|
@ -87,7 +89,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20221101',
|
'release_date': '20221101',
|
||||||
'album': 'Nekoverse',
|
'album': 'Nekoverse',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'Crystal World',
|
'track': 'Crystal World',
|
||||||
'track_number': 4,
|
'track_number': 4,
|
||||||
'duration': 199.0,
|
'duration': 199.0,
|
||||||
|
@ -113,7 +115,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20210115',
|
'release_date': '20210115',
|
||||||
'album': '進め!むじなカンパニー',
|
'album': '進め!むじなカンパニー',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
|
'track': 'md5:1a5fcbc96ca3c3265b1c6f9f79f30fd0',
|
||||||
'track_number': 1,
|
'track_number': 1,
|
||||||
},
|
},
|
||||||
|
@ -130,7 +132,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20210115',
|
'release_date': '20210115',
|
||||||
'album': '進め!むじなカンパニー',
|
'album': '進め!むじなカンパニー',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
|
'track': 'むじな de なじむ feat. 六科なじむ (CV: 日高里菜 )',
|
||||||
'track_number': 2,
|
'track_number': 2,
|
||||||
},
|
},
|
||||||
|
@ -147,7 +149,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20210115',
|
'release_date': '20210115',
|
||||||
'album': '進め!むじなカンパニー',
|
'album': '進め!むじなカンパニー',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': '進め!むじなカンパニー (instrumental)',
|
'track': '進め!むじなカンパニー (instrumental)',
|
||||||
'track_number': 3,
|
'track_number': 3,
|
||||||
},
|
},
|
||||||
|
@ -164,7 +166,7 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'acodec': 'mp3',
|
'acodec': 'mp3',
|
||||||
'release_date': '20210115',
|
'release_date': '20210115',
|
||||||
'album': '進め!むじなカンパニー',
|
'album': '進め!むじなカンパニー',
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'track': 'むじな de なじむ (instrumental)',
|
'track': 'むじな de なじむ (instrumental)',
|
||||||
'track_number': 4,
|
'track_number': 4,
|
||||||
},
|
},
|
||||||
|
@ -179,17 +181,14 @@ class NekoHackerIE(InfoExtractor):
|
||||||
playlist = get_element_by_class('playlist', webpage)
|
playlist = get_element_by_class('playlist', webpage)
|
||||||
|
|
||||||
if not playlist:
|
if not playlist:
|
||||||
iframe_src = traverse_obj(webpage, (
|
iframe = try_call(lambda: get_element_text_and_html_by_tag('iframe', webpage)[1]) or ''
|
||||||
{find_element(tag='iframe', html=True)}, {extract_attributes}, 'src', {url_or_none}))
|
iframe_src = url_or_none(extract_attributes(iframe).get('src'))
|
||||||
if not iframe_src:
|
if not iframe_src:
|
||||||
raise ExtractorError('No playlist or embed found in webpage')
|
raise ExtractorError('No playlist or embed found in webpage')
|
||||||
elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src):
|
elif re.match(r'https?://(?:\w+\.)?spotify\.com/', iframe_src):
|
||||||
raise ExtractorError('Spotify embeds are not supported', expected=True)
|
raise ExtractorError('Spotify embeds are not supported', expected=True)
|
||||||
return self.url_result(url, 'Generic')
|
return self.url_result(url, 'Generic')
|
||||||
|
|
||||||
player_params = self._search_json(
|
|
||||||
r'var srp_player_params_[\da-f]+\s*=', webpage, 'player params', playlist_id, default={})
|
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1):
|
for track_number, track in enumerate(re.findall(r'(<li[^>]+data-audiopath[^>]+>)', playlist), 1):
|
||||||
entry = traverse_obj(extract_attributes(track), {
|
entry = traverse_obj(extract_attributes(track), {
|
||||||
|
@ -201,12 +200,12 @@ class NekoHackerIE(InfoExtractor):
|
||||||
'album': 'data-albumtitle',
|
'album': 'data-albumtitle',
|
||||||
'duration': ('data-tracktime', {parse_duration}),
|
'duration': ('data-tracktime', {parse_duration}),
|
||||||
'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
|
'release_date': ('data-releasedate', {lambda x: re.match(r'\d{8}', x.replace('.', ''))}, 0),
|
||||||
|
'thumbnail': ('data-albumart', {url_or_none}),
|
||||||
})
|
})
|
||||||
entries.append({
|
entries.append({
|
||||||
**entry,
|
**entry,
|
||||||
'thumbnail': url_or_none(player_params.get('artwork')),
|
|
||||||
'track_number': track_number,
|
'track_number': track_number,
|
||||||
'artists': ['Neko Hacker'],
|
'artist': 'Neko Hacker',
|
||||||
'vcodec': 'none',
|
'vcodec': 'none',
|
||||||
'acodec': 'mp3' if entry['ext'] == 'mp3' else None,
|
'acodec': 'mp3' if entry['ext'] == 'mp3' else None,
|
||||||
})
|
})
|
||||||
|
|
|
@ -10,10 +10,10 @@ from ..utils import (
|
||||||
get_element_html_by_class,
|
get_element_html_by_class,
|
||||||
get_elements_by_class,
|
get_elements_by_class,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
|
try_call,
|
||||||
unified_timestamp,
|
unified_timestamp,
|
||||||
urlencode_postdata,
|
urlencode_postdata,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, find_elements, traverse_obj
|
|
||||||
|
|
||||||
|
|
||||||
class NubilesPornIE(InfoExtractor):
|
class NubilesPornIE(InfoExtractor):
|
||||||
|
@ -70,8 +70,9 @@ class NubilesPornIE(InfoExtractor):
|
||||||
url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0]
|
url, get_element_by_class('watch-page-video-wrapper', page), video_id)[0]
|
||||||
|
|
||||||
channel_id, channel_name = self._search_regex(
|
channel_id, channel_name = self._search_regex(
|
||||||
r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page) or '',
|
r'/video/website/(?P<id>\d+).+>(?P<name>\w+).com', get_element_html_by_class('site-link', page),
|
||||||
'channel', fatal=False, group=('id', 'name')) or (None, None)
|
'channel', fatal=False, group=('id', 'name')) or (None, None)
|
||||||
|
channel_name = re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
|
@ -81,14 +82,14 @@ class NubilesPornIE(InfoExtractor):
|
||||||
'thumbnail': media_entries.get('thumbnail'),
|
'thumbnail': media_entries.get('thumbnail'),
|
||||||
'description': clean_html(get_element_html_by_class('content-pane-description', page)),
|
'description': clean_html(get_element_html_by_class('content-pane-description', page)),
|
||||||
'timestamp': unified_timestamp(get_element_by_class('date', page)),
|
'timestamp': unified_timestamp(get_element_by_class('date', page)),
|
||||||
'channel': re.sub(r'([^A-Z]+)([A-Z]+)', r'\1 \2', channel_name) if channel_name else None,
|
'channel': channel_name,
|
||||||
'channel_id': channel_id,
|
'channel_id': channel_id,
|
||||||
'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'),
|
'channel_url': format_field(channel_id, None, 'https://members.nubiles-porn.com/video/website/%s'),
|
||||||
'like_count': int_or_none(get_element_by_id('likecount', page)),
|
'like_count': int_or_none(get_element_by_id('likecount', page)),
|
||||||
'average_rating': float_or_none(get_element_by_class('score', page)),
|
'average_rating': float_or_none(get_element_by_class('score', page)),
|
||||||
'age_limit': 18,
|
'age_limit': 18,
|
||||||
'categories': traverse_obj(page, ({find_element(cls='categories')}, {find_elements(cls='btn')}, ..., {clean_html})),
|
'categories': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_element_by_class('categories', page))))),
|
||||||
'tags': traverse_obj(page, ({find_elements(cls='tags')}, 1, {find_elements(cls='btn')}, ..., {clean_html})),
|
'tags': try_call(lambda: list(map(clean_html, get_elements_by_class('btn', get_elements_by_class('tags', page)[1])))),
|
||||||
'cast': get_elements_by_class('content-pane-performer', page),
|
'cast': get_elements_by_class('content-pane-performer', page),
|
||||||
'availability': 'needs_auth',
|
'availability': 'needs_auth',
|
||||||
'series': channel_name,
|
'series': channel_name,
|
||||||
|
|
|
@ -3,12 +3,14 @@ from ..networking.exceptions import HTTPError
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
clean_html,
|
clean_html,
|
||||||
|
get_element_text_and_html_by_tag,
|
||||||
int_or_none,
|
int_or_none,
|
||||||
str_or_none,
|
str_or_none,
|
||||||
|
traverse_obj,
|
||||||
|
try_call,
|
||||||
unified_timestamp,
|
unified_timestamp,
|
||||||
urljoin,
|
urljoin,
|
||||||
)
|
)
|
||||||
from ..utils.traversal import find_element, traverse_obj
|
|
||||||
|
|
||||||
|
|
||||||
class TBSJPEpisodeIE(InfoExtractor):
|
class TBSJPEpisodeIE(InfoExtractor):
|
||||||
|
@ -62,7 +64,7 @@ class TBSJPEpisodeIE(InfoExtractor):
|
||||||
self._merge_subtitles(subs, target=subtitles)
|
self._merge_subtitles(subs, target=subtitles)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})),
|
'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])),
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
**traverse_obj(episode, {
|
**traverse_obj(episode, {
|
||||||
'categories': ('keywords', {list}),
|
'categories': ('keywords', {list}),
|
||||||
|
|
|
@ -14,7 +14,6 @@ def _fmt_url(url):
|
||||||
|
|
||||||
|
|
||||||
class TelewebionIE(InfoExtractor):
|
class TelewebionIE(InfoExtractor):
|
||||||
_WORKING = False
|
|
||||||
_VALID_URL = r'https?://(?:www\.)?telewebion\.com/episode/(?P<id>(?:0x[a-fA-F\d]+|\d+))'
|
_VALID_URL = r'https?://(?:www\.)?telewebion\.com/episode/(?P<id>(?:0x[a-fA-F\d]+|\d+))'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'http://www.telewebion.com/episode/0x1b3139c/',
|
'url': 'http://www.telewebion.com/episode/0x1b3139c/',
|
||||||
|
|
|
@ -5142,7 +5142,6 @@ class _UnsafeExtensionError(Exception):
|
||||||
'rm',
|
'rm',
|
||||||
'swf',
|
'swf',
|
||||||
'ts',
|
'ts',
|
||||||
'vid',
|
|
||||||
'vob',
|
'vob',
|
||||||
'vp9',
|
'vp9',
|
||||||
|
|
||||||
|
@ -5175,7 +5174,6 @@ class _UnsafeExtensionError(Exception):
|
||||||
'heic',
|
'heic',
|
||||||
'ico',
|
'ico',
|
||||||
'image',
|
'image',
|
||||||
'jfif',
|
|
||||||
'jng',
|
'jng',
|
||||||
'jpe',
|
'jpe',
|
||||||
'jpeg',
|
'jpeg',
|
||||||
|
|
|
@ -20,7 +20,6 @@ from ._utils import (
|
||||||
get_elements_html_by_class,
|
get_elements_html_by_class,
|
||||||
get_elements_html_by_attribute,
|
get_elements_html_by_attribute,
|
||||||
get_elements_by_attribute,
|
get_elements_by_attribute,
|
||||||
get_element_by_class,
|
|
||||||
get_element_html_by_attribute,
|
get_element_html_by_attribute,
|
||||||
get_element_by_attribute,
|
get_element_by_attribute,
|
||||||
get_element_html_by_id,
|
get_element_html_by_id,
|
||||||
|
@ -374,7 +373,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None):
|
||||||
|
|
||||||
|
|
||||||
@typing.overload
|
@typing.overload
|
||||||
def find_element(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ...
|
def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ...
|
||||||
|
|
||||||
|
|
||||||
@typing.overload
|
@typing.overload
|
||||||
|
@ -382,14 +381,14 @@ def find_element(*, cls: str, html=False): ...
|
||||||
|
|
||||||
|
|
||||||
@typing.overload
|
@typing.overload
|
||||||
def find_element(*, id: str, tag: str | None = None, html=False, regex=False): ...
|
def find_element(*, id: str, tag: str | None = None, html=False): ...
|
||||||
|
|
||||||
|
|
||||||
@typing.overload
|
@typing.overload
|
||||||
def find_element(*, tag: str, html=False, regex=False): ...
|
def find_element(*, tag: str, html=False): ...
|
||||||
|
|
||||||
|
|
||||||
def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False, regex=False):
|
def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False):
|
||||||
# deliberately using `id=` and `cls=` for ease of readability
|
# deliberately using `id=` and `cls=` for ease of readability
|
||||||
assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required'
|
assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required'
|
||||||
ANY_TAG = r'[\w:.-]+'
|
ANY_TAG = r'[\w:.-]+'
|
||||||
|
@ -398,18 +397,17 @@ def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=Fal
|
||||||
assert not cls, 'Cannot match both attr and cls'
|
assert not cls, 'Cannot match both attr and cls'
|
||||||
assert not id, 'Cannot match both attr and id'
|
assert not id, 'Cannot match both attr and id'
|
||||||
func = get_element_html_by_attribute if html else get_element_by_attribute
|
func = get_element_html_by_attribute if html else get_element_by_attribute
|
||||||
return functools.partial(func, attr, value, tag=tag or ANY_TAG, escape_value=not regex)
|
return functools.partial(func, attr, value, tag=tag or ANY_TAG)
|
||||||
|
|
||||||
elif cls:
|
elif cls:
|
||||||
assert not id, 'Cannot match both cls and id'
|
assert not id, 'Cannot match both cls and id'
|
||||||
assert tag is None, 'Cannot match both cls and tag'
|
assert tag is None, 'Cannot match both cls and tag'
|
||||||
assert not regex, 'Cannot use regex with cls'
|
func = get_element_html_by_class if html else get_elements_by_class
|
||||||
func = get_element_html_by_class if html else get_element_by_class
|
|
||||||
return functools.partial(func, cls)
|
return functools.partial(func, cls)
|
||||||
|
|
||||||
elif id:
|
elif id:
|
||||||
func = get_element_html_by_id if html else get_element_by_id
|
func = get_element_html_by_id if html else get_element_by_id
|
||||||
return functools.partial(func, id, tag=tag or ANY_TAG, escape_value=not regex)
|
return functools.partial(func, id, tag=tag or ANY_TAG)
|
||||||
|
|
||||||
index = int(bool(html))
|
index = int(bool(html))
|
||||||
return lambda html: get_element_text_and_html_by_tag(tag, html)[index]
|
return lambda html: get_element_text_and_html_by_tag(tag, html)[index]
|
||||||
|
@ -420,20 +418,19 @@ def find_elements(*, cls: str, html=False): ...
|
||||||
|
|
||||||
|
|
||||||
@typing.overload
|
@typing.overload
|
||||||
def find_elements(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ...
|
def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ...
|
||||||
|
|
||||||
|
|
||||||
def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False, regex=False):
|
def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False):
|
||||||
# deliberately using `cls=` for ease of readability
|
# deliberately using `cls=` for ease of readability
|
||||||
assert cls or (attr and value), 'One of cls or (attr AND value) is required'
|
assert cls or (attr and value), 'One of cls or (attr AND value) is required'
|
||||||
|
|
||||||
if attr and value:
|
if attr and value:
|
||||||
assert not cls, 'Cannot match both attr and cls'
|
assert not cls, 'Cannot match both attr and cls'
|
||||||
func = get_elements_html_by_attribute if html else get_elements_by_attribute
|
func = get_elements_html_by_attribute if html else get_elements_by_attribute
|
||||||
return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+', escape_value=not regex)
|
return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+')
|
||||||
|
|
||||||
assert not tag, 'Cannot match both cls and tag'
|
assert not tag, 'Cannot match both cls and tag'
|
||||||
assert not regex, 'Cannot use regex with cls'
|
|
||||||
func = get_elements_html_by_class if html else get_elements_by_class
|
func = get_elements_html_by_class if html else get_elements_by_class
|
||||||
return functools.partial(func, cls)
|
return functools.partial(func, cls)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user