Compare commits

..

6 Commits

Author SHA1 Message Date
DmitryScaletta
99dec4d6ed
[PromoDJ] Add music format ids 2024-02-15 13:48:13 +03:00
DmitryScaletta
1b3c186424
[PromoDJ] Fix page size for playlists 2024-02-15 12:41:29 +03:00
DmitryScaletta
7e96492ba0
[PromoDJ] Fix page size for playlists 2024-02-15 12:39:46 +03:00
DmitryScaletta
e6f3e6de0e
[PromoDJ] Fix paid music metadata 2024-02-15 12:34:50 +03:00
DmitryScaletta
c837d90e12
[PromoDJ] Add support for user's best media playlist 2024-02-15 12:21:53 +03:00
DmitryScaletta
c820715205
[PromoDJ] Fix parse data and size functions 2024-02-15 11:53:12 +03:00

View File

@ -114,10 +114,6 @@ class PromoDJBaseIE(InfoExtractor):
if YoutubeIE.suitable(iframe_url):
yield self.url_result(iframe_url, YoutubeIE)
def _get_playlist_page_size(self, url):
    """Return the page size used for the playlist at *url*.

    URLs containing '/groups/' yield 20; every other URL yields 30.
    """
    if '/groups/' in url:
        return 20
    return 30
def _get_current_page(self, html):
    """Return the current page number extracted from *html* as an int.

    The value comes from ``get_element_by_class('NavigatorCurrentPage', html)``
    passed through ``clean_html``; when that result is falsy, 1 is returned.
    """
    page_text = clean_html(get_element_by_class('NavigatorCurrentPage', html))
    if not page_text:
        page_text = '1'
    return int(page_text)
@ -159,10 +155,11 @@ class PromoDJBaseIE(InfoExtractor):
})
}
formats = [traverse_obj(source, {
'url': ('URL', {url_or_none}),
'size': ('size', {int_or_none}),
}) for source in traverse_obj(media_data, ('sources'))]
formats = [{
'format_id': 'lossy',
'url': traverse_obj(source, ('URL', {url_or_none})),
'size': traverse_obj(source, ('size', {int_or_none})),
} for source in traverse_obj(media_data, ('sources'))]
thumbnails = [{
'url': url,
} for url in traverse_obj(media_data, ('coverURL', ('600', '1200', '2000'))) if url_or_none(url)]
@ -382,6 +379,7 @@ class PromoDJUserPageIE(PromoDJBaseIE):
'blog',
'feedback',
'contact',
'uenno',
*PromoDJBaseIE._MEDIA_TYPES,
]
_NOT_USER_PAGE_RE = '|'.join(_USER_PAGES)
@ -447,8 +445,11 @@ class PromoDJBlogPageIE(PromoDJBaseIE):
class PromoDJPlaylistIE(PromoDJBaseIE):
_PLAYLIST_TYPES = ['uenno', *PromoDJBaseIE._MEDIA_TYPES]
_PLAYLIST_TYPES_RE = '|'.join(_PLAYLIST_TYPES)
_VALID_URL = [
rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>{PromoDJBaseIE._MEDIA_TYPES_RE})$',
rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>{_PLAYLIST_TYPES_RE})$',
rf'{PromoDJBaseIE._BASE_URL_RE}/(?P<login>{PromoDJBaseIE._LOGIN_RE})/(?P<type>groups)/(?P<id>\d+)(?:/(?P<slug>\w+))?',
]
_TESTS = [{
@ -507,20 +508,36 @@ class PromoDJPlaylistIE(PromoDJBaseIE):
# 900+ items
'url': 'https://promodj.com/fonarev/groups/17350/Digital_Emotions_Podcast',
'only_matching': True,
}, {
# user's best music and video
'url': 'https://promodj.com/djbaribyn/uenno',
'info_dict': {
'id': 'djbaribyn-uenno',
},
'playlist_count': 15,
'params': {
'playlistend': 15,
}
}]
_ALLOWED_MEDIA_CATS = ['music', 'video']
def _get_page_size(self, type):
    """Return the number of items per page for the playlist *type*.

    'uenno' maps to 15, 'groups' maps to 20 and any other value to 30.
    """
    if type == 'uenno':
        page_size = 15
    elif type == 'groups':
        page_size = 20
    else:
        page_size = 30
    return page_size
def _real_extract(self, url):
match = self._match_valid_url(url)
login = match.group('login')
type = match.group('type')
playlist_id = f'{login}-{type}' if len(match.groups()) == 2 else f'{login}-{type}-{match.group("id")}'
page_size = self._get_playlist_page_size(url)
entries = OnDemandPagedList(
functools.partial(self._fetch_page, url, self._ALLOWED_MEDIA_CATS, playlist_id),
page_size)
self._get_page_size(type))
return self.playlist_result(entries, playlist_id=playlist_id)
@ -753,7 +770,6 @@ class PromoDJIE(PromoDJBaseIE):
},
}]
_IS_PAID_RE = r'<b>Цена:</b>'
# examples: MP3, 320 Кбит | MP4, 20157 Кбит | WAV, 1412 Кбит | AVI, 1731 Кбит | ASF, 6905 Кбит | FLAC, 1509 Кбит
# https://regex101.com/r/2AuaxB/1
_FORMATS_RE = r'(?:<a\s+href=\"(?P<url>[^\"]+)\">)?\s*\w+, (?P<bitrate>\d+) Кбит'
@ -761,7 +777,7 @@ class PromoDJIE(PromoDJBaseIE):
# examples: 0:21 | 1:07 | 74:38
_DURATION_RE = r'<b>Продолжительность:</b>\s*(\d+:\d{2})'
# examples: 818.4 Кб | 12.9 Мб | 4 Гб | 1.76 Гб | 1001.5 Мб
_SIZE_RE = r'<b>Размер:</b>\s*(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>Кб|Мб|Гб)'
# The unit alternatives mirror the units handled by _parse_ru_size: Б, Кб, Мб, Гб, Тб
_SIZE_RE = r'<b>Размер:</b>\s*(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>Б|Кб|Мб|Гб|Тб)'
# examples: сегодня 2:55 | вчера 23:17 | 1 июня 2016 3:46
_TIMESTAMP_RE = r'<b>Публикация:</b>\s*(?P<day>вчера|сегодня|\d{1,2})(?: (?P<month>[а-я]+) (?P<year>\d{4}))?\s*(?P<hours>\d{1,2}):(?P<minutes>\d{2})'
_TAGS_RE = r'<span\s+class=\"styles\">([^\n]+)</span>'
@ -771,9 +787,8 @@ class PromoDJIE(PromoDJBaseIE):
# https://regex101.com/r/b9utBf/1
_VIDEO_DATA_REGEX = r'({\"video\":true,\"config\":[^\n]+)\);'
def _parse_ru_date(self, raw_date):
def _parse_ru_date(self, day, month, year, hours, minutes):
RU_MONTHS = ['января', 'февраля', 'марта', 'апреля', 'мая', 'июня', 'июля', 'августа', 'сентября', 'октября', 'ноября', 'декабря']
day, month, year, hours, minutes = raw_date
if day == 'сегодня':
d = datetime.date.today()
day = d.day
@ -790,10 +805,9 @@ class PromoDJIE(PromoDJBaseIE):
year = int(year)
return datetime.datetime(year, month, day, int(hours), int(minutes)).timestamp()
def _parse_ru_size(self, raw_size):
    """Convert a ``(size, unit)`` pair with a Russian unit label into bytes.

    *raw_size* is unpacked into a numeric string and one of the labels
    Б, Кб, Мб or Гб; each step up the list multiplies by 1024.
    Raises ValueError for an unparsable number or an unknown unit.
    """
    unit_labels = ['Б', 'Кб', 'Мб', 'Гб']
    amount, unit_label = raw_size
    value = float(amount)
    power = unit_labels.index(unit_label)
    return int(value * 1024 ** power)
def _parse_ru_size(self, size, unit):
    """Convert a size given with a Russian unit label into a byte count.

    *size* is a numeric string and *unit* one of Б, Кб, Мб, Гб or Тб;
    each step up the list multiplies by 1024.
    Raises ValueError for an unparsable number or an unknown unit.
    """
    unit_labels = ['Б', 'Кб', 'Мб', 'Гб', 'Тб']
    value = float(size)
    power = unit_labels.index(unit)
    return int(value * 1024 ** power)
# music: always has a lossy format (mp3), sometimes also a lossless format (wav or flac)
# video: sometimes has a source format (mp4, avi, asf), always has a format converted for the web (mp4)
@ -821,9 +835,9 @@ class PromoDJIE(PromoDJBaseIE):
# download links can be missing
# best quality format always comes first
formats_from_html = re.findall(self._FORMATS_RE, meta_html)
is_paid = re.search(self._IS_PAID_RE, meta_html)
is_paid = '<b>Цена:</b>' in meta_html
# size field describes best quality
size = self._parse_ru_size(re.search(self._SIZE_RE, meta_html).groups())
size = self._parse_ru_size(*re.search(self._SIZE_RE, meta_html).groups())
if type == 'videos':
for url, bitrate in formats_from_html:
if url_or_none(url):
@ -834,14 +848,15 @@ class PromoDJIE(PromoDJBaseIE):
'size': size,
'quality': 1,
})
else:
elif not is_paid:
for i, match in enumerate(formats_from_html):
url, bitrate = match
is_last = i == len(formats_from_html) - 1
if is_last:
metadata['formats'][0]['abr'] = int(bitrate)
elif url_or_none(url) and not is_paid:
elif url_or_none(url):
metadata['formats'].append({
'format_id': 'lossless',
'url': url,
'abr': int(bitrate),
})
@ -851,7 +866,7 @@ class PromoDJIE(PromoDJBaseIE):
'title': clean_html(get_element_by_class('file_title', html)),
'view_count': int_or_none(self._search_regex(self._VIEW_COUNT_RE, meta_html, 'view_count', default=None)),
'duration': parse_duration(self._search_regex(self._DURATION_RE, meta_html, 'duration')),
'timestamp': self._parse_ru_date(re.search(self._TIMESTAMP_RE, meta_html).groups()),
'timestamp': self._parse_ru_date(*re.search(self._TIMESTAMP_RE, meta_html).groups()),
'tags': self._html_search_regex(self._TAGS_RE, meta_html, 'tags').split(', '),
})