Compare commits

...

2 Commits

Author SHA1 Message Date
grqx_wsl
901e78af62 improve regex 2024-11-04 14:19:52 +13:00
grqx_wsl
9a6f9843c0 use _extract_from_webpage and _extract_embed_urls
- `_extract_playlist_entries` is now a `classmethod`
- case insensitive html tag matching

Co-authored-by: dirkf <fieldhouse@gmx.net>
2024-11-04 14:09:42 +13:00

View File

@ -15,11 +15,12 @@ from ..utils import (
int_or_none, int_or_none,
join_nonempty, join_nonempty,
merge_dicts, merge_dicts,
orderedSet,
parse_count, parse_count,
parse_duration, parse_duration,
smuggle_url,
strip_or_none, strip_or_none,
unified_strdate, unified_strdate,
unsmuggle_url,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
@ -45,7 +46,7 @@ class BoomplayBaseIE(InfoExtractor):
""" """
# get_elements_text_and_html_by_attribute returns a generator # get_elements_text_and_html_by_attribute returns a generator
return get_elements_text_and_html_by_attribute( return get_elements_text_and_html_by_attribute(
'class', rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html, attribute='class', value=rf'''[^'"]*(?<=['"\s]){class_}(?=['"\s])[^'"]*''', html=html,
tag=tag, escape_value=False) tag=tag, escape_value=False)
@classmethod @classmethod
@ -111,7 +112,7 @@ class BoomplayBaseIE(InfoExtractor):
metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or '' metadata_div = self._get_element_by_class_and_tag('summary', 'div', webpage) or ''
metadata_entries = re.findall(r'(?si)<strong>(?P<entry>.*?)</strong>', metadata_div) or [] metadata_entries = re.findall(r'(?si)<strong>(?P<entry>.*?)</strong>', metadata_div) or []
description = re.sub( description = re.sub(
'(?i)Listen and download music for free on Boomplay!', '', r'(?i)Listen and download music for free on Boomplay!', '',
clean_html(self._get_element_by_class_and_tag( clean_html(self._get_element_by_class_and_tag(
'description_content', 'span', webpage)) or '') or None 'description_content', 'span', webpage)) or '') or None
@ -145,39 +146,53 @@ class BoomplayBaseIE(InfoExtractor):
page_metadata['release_year'] = int_or_none(v) page_metadata['release_year'] = int_or_none(v)
return page_metadata return page_metadata
def _extract_suitable_links(self, webpage, media_types=None): @classmethod
if media_types is None: def _extract_from_webpage(cls, url, webpage, **kwargs):
media_types = self._MEDIA_TYPES if kwargs:
media_types = list(variadic(media_types)) url = smuggle_url(url, kwargs)
return super()._extract_from_webpage(url, webpage)
for idx, v in enumerate(media_types): @classmethod
media_types[idx] = re.escape(v) if v in self._MEDIA_TYPES else '' def _extract_embed_urls(cls, url, webpage):
media_types = join_nonempty(*media_types, delim='|') url, smuggled_data = unsmuggle_url(url)
return orderedSet(traverse_obj(re.finditer( media_types = variadic(smuggled_data.get('media_types', cls._MEDIA_TYPES))
rf'''(?x) media_types = join_nonempty(*(
re.escape(v)for v in media_types if v in cls._MEDIA_TYPES),
delim='|')
for mobj in re.finditer(
rf'''(?ix)
<a <a
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
(?<=\s)href\s*=\s*(?P<_q>['"]) (?<=\s)href\s*=\s*(?P<_q>['"])
(?: (?!javascript:)(?P<href>/(?:{media_types})/\d+/?[\-\w=?&#:;@]*)
(?!javascript:)(?P<link>/(?:{media_types})/\d+/?[\-a-zA-Z=?&#:;@]*)
)
(?P=_q) (?P=_q)
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
>''', webpage), (..., 'link', {self._urljoin}, {self.url_result}))) >''', webpage):
if url := cls._urljoin(mobj.group('href')):
yield url
def _extract_playlist_entries(self, webpage, media_types, warn=True): @classmethod
def _extract_playlist_entries(cls, webpage, media_types, warn=True):
song_list = strip_or_none( song_list = strip_or_none(
self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) cls._get_element_by_class_and_tag('morePart_musics', 'ol', webpage)
or self._get_element_by_class_and_tag('morePart', 'ol', webpage) or cls._get_element_by_class_and_tag('morePart', 'ol', webpage)
or '') or '')
entries = traverse_obj(self.__yield_elements_html_by_class_and_tag( entries = traverse_obj(cls.__yield_elements_html_by_class_and_tag(
'songName', 'a', song_list), 'songName', 'a', song_list),
(..., {extract_attributes}, 'href', {self._urljoin}, {self.url_result})) (..., {extract_attributes}, 'href', {cls._urljoin}, {cls.url_result}))
if not entries: if not entries:
if warn: if warn:
self.report_warning('Failed to extract playlist entries, finding suitable links instead!') cls.report_warning('Failed to extract playlist entries, finding suitable links instead!')
return self._extract_suitable_links(webpage, media_types)
def strip_ie(entry):
# All our IEs have a _VALID_URL and set a key: don't use it
entry.pop('ie_key', None)
return entry
return (strip_ie(result) for result in
cls._extract_from_webpage(cls._BASE, webpage, media_types=media_types))
return entries return entries
@ -302,7 +317,7 @@ class BoomplayPodcastIE(BoomplayBaseIE):
webpage = self._download_webpage(url, playlist_id) webpage = self._download_webpage(url, playlist_id)
song_list = self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage) song_list = self._get_element_by_class_and_tag('morePart_musics', 'ol', webpage)
song_list = traverse_obj(re.finditer( song_list = traverse_obj(re.finditer(
r'''(?x) r'''(?ix)
<li <li
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
\sdata-id\s*=\s* \sdata-id\s*=\s*