Compare commits

..

4 Commits

Author SHA1 Message Date
SirElderling
a4dc13ccc3
Update yt_dlp/extractor/radiocomercial.py
Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
2023-11-05 21:35:52 +00:00
SirElderling
b8445e27b7
Update yt_dlp/extractor/radiocomercial.py
Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
2023-11-05 21:35:07 +00:00
SirElderling
473860d97d [RadioComercial] Add extractor - add skip to test with inconsistent md5 2023-11-05 20:56:42 +00:00
SirElderling
c24e8e39e5 [RadioComercial] Add extractor - more review recommendations 2023-11-05 20:49:59 +00:00

View File

@ -17,15 +17,16 @@ from ..utils.traversal import traverse_obj
class RadioComercialIE(InfoExtractor): class RadioComercialIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/]+/\w(?P<season>\d+)/(?P<id>[-\w]+)/*$' _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/]+/\D*(?P<season>\d+)/(?P<id>[\w-]+)'
_TESTS = [{ _TESTS = [
{
'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas', 'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4', 'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
'info_dict': { 'info_dict': {
'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas', 'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.', 'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
'description': '', 'description': None,
'release_date': '20231025', 'release_date': '20231025',
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
'season': 6 'season': 6
@ -38,7 +39,7 @@ class RadioComercialIE(InfoExtractor):
'id': 'convenca-me-num-minuto-que-os-lobisomens-existem', 'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Convença-me num minuto que os lobisomens existem', 'title': 'Convença-me num minuto que os lobisomens existem',
'description': '', 'description': None,
'release_date': '20231026', 'release_date': '20231026',
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
'season': 3 'season': 3
@ -55,6 +56,23 @@ class RadioComercialIE(InfoExtractor):
'release_date': '20231101', 'release_date': '20231101',
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg', 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
'season': 2 'season': 2
},
'params': {
# inconsistent md5
'skip_download': True,
},
},
{
'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
'md5': '91d32d4d4b1407272068b102730fc9fa',
'info_dict': {
'id': 't-n-t-29-de-outubro',
'ext': 'mp3',
'title': 'T.N.T 29 de outubro',
'description': None,
'release_date': '20231029',
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
'season': 2023
} }
}, },
] ]
@ -65,7 +83,7 @@ class RadioComercialIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'title': self._html_extract_title(webpage), 'title': self._html_extract_title(webpage),
'description': self._og_search_description(webpage, default=''), 'description': self._og_search_description(webpage, default=None),
'release_date': unified_strdate( 'release_date': unified_strdate(
get_element_by_class('date', get_element_html_by_class('descriptions', webpage))), get_element_by_class('date', get_element_html_by_class('descriptions', webpage))),
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
@ -75,49 +93,52 @@ class RadioComercialIE(InfoExtractor):
class RadioComercialPlaylistIE(InfoExtractor): class RadioComercialPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[-\w]+)[/\w\d+]*$' _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:\D*(?P<season>\d+))?'
_PAGE_SIZE = 19 _PAGE_SIZE = 19
_TESTS = [{ _TESTS = [
{
'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3', 'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
'info_dict': { 'info_dict': {
'id': 'convenca-me-num-minuto', 'id': 'convenca-me-num-minuto',
'title': 'Convença-me num Minuto - Temporada 3', 'title': 'Convença-me num Minuto - Temporada 3',
}, },
'playlist_mincount': 32 'playlist_mincount': 32
}, { },
{
'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao', 'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
'info_dict': { 'info_dict': {
'id': 'o-homem-que-mordeu-o-cao', 'id': 'o-homem-que-mordeu-o-cao',
'title': 'O Homem Que Mordeu o Cão', 'title': 'O Homem Que Mordeu o Cão',
}, },
'playlist_mincount': 19 'playlist_mincount': 19
}, { },
{
'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas', 'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
'info_dict': { 'info_dict': {
'id': 'as-minhas-coisas-favoritas', 'id': 'as-minhas-coisas-favoritas',
'title': 'As Minhas Coisas Favoritas', 'title': 'As Minhas Coisas Favoritas',
}, },
'playlist_mincount': 131 'playlist_mincount': 131
}, }
] ]
def _fetch_page(self, url, season, page): def _fetch_page(self, podcast, season, page):
page += 1 page += 1
next_page = f'{url}{"/" + str(page) if page > 1 else ""}' url = f'https://radiocomercial.pt/podcasts/{podcast}' + (f'/t{season}' if season else '') + f'/{page}'
webpage = self._download_webpage(next_page, season, headers={'X-Requested-With': 'XMLHttpRequest'}, playlist_id = join_nonempty(podcast, season, delim='_')
note=f'Downloading page: {next_page}') webpage = self._download_webpage(url, playlist_id, note=f'Downloading page: {page}')
episodes = set(traverse_obj(get_elements_html_by_class('tm-ouvir-podcast', webpage), episodes = set(traverse_obj(get_elements_html_by_class('tm-ouvir-podcast', webpage),
(..., {extract_attributes}, 'href'))) (..., {extract_attributes}, 'href')))
for entry in episodes: for entry in episodes:
yield self.url_result(f'https://radiocomercial.pt{entry}', RadioComercialIE) yield self.url_result(f'https://radiocomercial.pt{entry}', RadioComercialIE)
def _real_extract(self, url): def _real_extract(self, url):
podcast = self._match_id(url) podcast, season = self._match_valid_url(url).group('id', 'season')
webpage = self._download_webpage(url, podcast) webpage = self._download_webpage(url, podcast)
name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0]) name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
season = self._html_extract_title(webpage) title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')
title = name if name == season else join_nonempty(name, season, delim=' - ')
return self.playlist_result(OnDemandPagedList(functools.partial(self._fetch_page, url, season), return self.playlist_result(OnDemandPagedList(functools.partial(self._fetch_page, podcast, season),
self._PAGE_SIZE), podcast, title) self._PAGE_SIZE), podcast, title)