Compare commits

...

3 Commits

Author SHA1 Message Date
DmitryScaletta
2416fddcfb
[PromoDJ] Add codecs 2024-02-17 05:24:49 +03:00
DmitryScaletta
49ac5d31a3
[PromoDJ] Update radio extractor and add tests 2024-02-17 05:06:06 +03:00
DmitryScaletta
e32ba3fc21
[PromoDJ] Fix login regex 2024-02-17 04:16:05 +03:00

View File

@@ -6,16 +6,17 @@ import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from .youtube import YoutubeIE
from ..utils import ( from ..utils import (
OnDemandPagedList,
clean_html, clean_html,
dict_get, dict_get,
extract_attributes, extract_attributes,
ExtractorError, ExtractorError,
get_element_by_class, get_element_by_class,
get_element_html_by_id,
get_elements_html_by_class, get_elements_html_by_class,
int_or_none, int_or_none,
js_to_json, js_to_json,
merge_dicts, merge_dicts,
OnDemandPagedList,
parse_duration, parse_duration,
str_or_none, str_or_none,
traverse_obj, traverse_obj,
@@ -65,8 +66,8 @@ class PromoDJBaseIE(InfoExtractor):
_PAGES = ['featured', 'shop', *_MEDIA_TYPES] _PAGES = ['featured', 'shop', *_MEDIA_TYPES]
_BASE_URL_RE = r'https?://(?:www\.)?promodj\.com' _BASE_URL_RE = r'https?://(?:www\.)?promodj\.com'
_NOT_LOGIN_LIST = '|'.join(['radio', *_PAGES]) _NOT_LOGIN_LIST = '|'.join(['radio', 'embed', *_PAGES])
_LOGIN_RE = rf'(?!{_NOT_LOGIN_LIST})[\w.-]+' _LOGIN_RE = rf'(?!(?:{_NOT_LOGIN_LIST})(?:/|$))[\w.-]+'
def _set_url_page(self, url, page): def _set_url_page(self, url, page):
parsed_url = urllib.parse.urlparse(url) parsed_url = urllib.parse.urlparse(url)
@@ -154,6 +155,8 @@ class PromoDJBaseIE(InfoExtractor):
'format_id': 'lossy', 'format_id': 'lossy',
'url': traverse_obj(source, ('URL', {url_or_none})), 'url': traverse_obj(source, ('URL', {url_or_none})),
'size': traverse_obj(source, ('size', {int_or_none})), 'size': traverse_obj(source, ('size', {int_or_none})),
'acodec': 'mp3',
'vcodec': 'none',
} for source in traverse_obj(media_data, ('sources'))] } for source in traverse_obj(media_data, ('sources'))]
thumbnails = [{ thumbnails = [{
'url': url, 'url': url,
@@ -247,6 +250,10 @@ class PromoDJUserIE(PromoDJBaseIE):
'id': 'slim96', 'id': 'slim96',
}, },
'playlist_count': 0, 'playlist_count': 0,
}, {
# login starts with page name
'url': 'https://promodj.com/radio.remix',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@@ -288,6 +295,10 @@ class PromoDJUserMediaIE(PromoDJBaseIE):
'id': 'worobyev-video', 'id': 'worobyev-video',
}, },
'playlist_count': 0, 'playlist_count': 0,
}, {
# login starts with page name
'url': 'https://promodj.com/radio.remix/music',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@@ -378,15 +389,18 @@ class PromoDJUserPageIE(PromoDJBaseIE):
*PromoDJBaseIE._MEDIA_TYPES, *PromoDJBaseIE._MEDIA_TYPES,
] ]
_NOT_USER_PAGE_LIST = '|'.join(_USER_PATHS) _NOT_USER_PAGE_LIST = '|'.join(_USER_PATHS)
_USER_PAGE_RE = rf'(?!{_NOT_USER_PAGE_LIST})[\w-]+'
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<slug>{_USER_PAGE_RE})$' _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<slug>(?!(?:{_NOT_USER_PAGE_LIST})$)[\w-]+$)'
_TESTS = [{ _TESTS = [{
'url': 'https://promodj.com/djperetse/MaxMixes', 'url': 'https://promodj.com/djperetse/MaxMixes',
'info_dict': { 'info_dict': {
'id': 'djperetse-MaxMixes', 'id': 'djperetse-MaxMixes',
}, },
'playlist_count': 5, 'playlist_count': 5,
}, {
# user page starts with media type (not a real link)
'url': 'https://promodj.com/djperetse/remixes-best',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@@ -768,7 +782,7 @@ class PromoDJIE(PromoDJBaseIE):
# examples: MP3, 320 Кбит | MP4, 20157 Кбит | WAV, 1412 Кбит | AVI, 1731 Кбит | ASF, 6905 Кбит | FLAC, 1509 Кбит # examples: MP3, 320 Кбит | MP4, 20157 Кбит | WAV, 1412 Кбит | AVI, 1731 Кбит | ASF, 6905 Кбит | FLAC, 1509 Кбит
# https://regex101.com/r/2AuaxB/1 # https://regex101.com/r/2AuaxB/1
_FORMATS_RE = r'(?:<a\s+href=\"(?P<url>[^\"]+)\">)?\s*\w+, (?P<bitrate>\d+) Кбит' _FORMATS_RE = r'(?:<a\s+href=\"(?P<url>[^\"]+)\">)?\s*(?P<format>\w+), (?P<bitrate>\d+) Кбит'
_VIEW_COUNT_RE = r'<b>(?:Прослушиваний|Просмотров):</b>\s*(\d+)' _VIEW_COUNT_RE = r'<b>(?:Прослушиваний|Просмотров):</b>\s*(\d+)'
# examples: 0:21 | 1:07 | 74:38 # examples: 0:21 | 1:07 | 74:38
_DURATION_RE = r'<b>Продолжительность:</b>\s*(\d+:\d{2})' _DURATION_RE = r'<b>Продолжительность:</b>\s*(\d+:\d{2})'
@@ -835,18 +849,19 @@ class PromoDJIE(PromoDJBaseIE):
# size field describes best quality # size field describes best quality
size = self._parse_ru_size(*re.search(self._SIZE_RE, meta_html).groups()) size = self._parse_ru_size(*re.search(self._SIZE_RE, meta_html).groups())
if type == 'videos': if type == 'videos':
for url, bitrate in formats_from_html: for url, format, bitrate in formats_from_html:
if url_or_none(url): if url_or_none(url):
metadata['formats'].append({ metadata['formats'].append({
'format_id': 'source', 'format_id': 'source',
'url': url, 'url': url,
'tbr': int(bitrate), 'tbr': int(bitrate),
'size': size, 'size': size,
'container': format.lower(),
'quality': 1, 'quality': 1,
}) })
elif not is_paid: elif not is_paid:
for i, match in enumerate(formats_from_html): for i, match in enumerate(formats_from_html):
url, bitrate = match url, format, bitrate = match
is_last = i == len(formats_from_html) - 1 is_last = i == len(formats_from_html) - 1
if is_last: if is_last:
metadata['formats'][0]['abr'] = int(bitrate) metadata['formats'][0]['abr'] = int(bitrate)
@@ -855,6 +870,8 @@ class PromoDJIE(PromoDJBaseIE):
'format_id': 'lossless', 'format_id': 'lossless',
'url': url, 'url': url,
'abr': int(bitrate), 'abr': int(bitrate),
'acodec': format.lower(),
'vcodec': 'none',
}) })
metadata['formats'][-1]['size'] = size metadata['formats'][-1]['size'] = size
@@ -978,19 +995,55 @@ class PromoDJRadioIE(PromoDJBaseIE):
_VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/radio#(?P<id>\w+)' _VALID_URL = rf'{PromoDJBaseIE._BASE_URL_RE}/radio#(?P<id>\w+)'
_TESTS = [{ _TESTS = [{
'url': 'https://promodj.com/radio#dubstep', 'url': 'https://promodj.com/radio#dubstep',
'only_matching': True, 'info_dict': {
'id': 'dubstep',
'ext': 'mp3',
'title': r're:^Dubstep ',
'description': 'Всё лучше под дабстеп',
'thumbnail': r're:^https?://',
'live_status': 'is_live',
},
}, { }, {
'url': 'https://promodj.com/radio#oldschool', 'url': 'https://promodj.com/radio#oldschool',
'only_matching': True, 'info_dict': {
'id': 'oldschool',
'ext': 'mp3',
'title': r're:^Old-School ',
'description': 'То самое доброе, старое, вечное',
'thumbnail': r're:^https?://',
'live_status': 'is_live',
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
id = self._match_id(url) slug = self._match_id(url)
html = self._download_webpage(url, slug)
radio_span = get_element_html_by_id(f'radio_{slug}', html)
if not radio_span:
raise ExtractorError('Radio channel is offline or not exists', expected=True)
id = self._search_regex(r'amba="radio:(\d+)"', radio_span, 'id')
tooltip_html = self._download_webpage(
f'https://promodj.com/ajax/tooltip.html?wtf=radio:{id}', slug,
note='Downloading tooltip webpage')
title = clean_html(self._search_regex(
r'<h1[^>]*><b>([^<]+)</b></h1>', tooltip_html, 'title', default=None))
description = clean_html(self._search_regex(
r'<div>([^<]+)</div>', tooltip_html, 'description', default=None))
thumbnail = self._search_regex(
rf'#radio_{slug}:after {{ background-image: url\(([^)]+)\); }}',
html, 'thumbnail', default=None)
return { return {
'id': id, 'id': slug,
'title': title,
'description': description,
'thumbnail': url_or_none(thumbnail),
'formats': [{ 'formats': [{
'url': f'https://radio.promodj.com/{id}-192', 'url': f'https://radio.promodj.com/{slug}-192',
'abr': 192, 'abr': 192,
'ext': 'mp3',
'acodec': 'mp3',
'vcodec': 'none',
}], }],
'is_live': True, 'is_live': True,
} }