mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-09-22 10:21:24 +02:00
Compare commits
No commits in common. "e0c3428bcba877f9ec7c26eb8919ae0e3630e965" and "bd9904cab6de07227382024983d7355a04256208" have entirely different histories.
e0c3428bcb
...
bd9904cab6
|
@ -23,7 +23,6 @@ from .youtube import ( # Youtube is moved to the top to improve performance
|
||||||
YoutubeShortsAudioPivotIE,
|
YoutubeShortsAudioPivotIE,
|
||||||
YoutubeConsentRedirectIE,
|
YoutubeConsentRedirectIE,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .abc import (
|
from .abc import (
|
||||||
ABCIE,
|
ABCIE,
|
||||||
ABCIViewIE,
|
ABCIViewIE,
|
||||||
|
|
|
@ -1,23 +1,36 @@
|
||||||
import functools
|
import re
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
OnDemandPagedList,
|
|
||||||
extract_attributes,
|
|
||||||
get_element_by_class,
|
|
||||||
get_element_html_by_class,
|
|
||||||
get_element_text_and_html_by_tag,
|
|
||||||
get_elements_html_by_class,
|
|
||||||
int_or_none,
|
int_or_none,
|
||||||
join_nonempty,
|
|
||||||
try_call,
|
|
||||||
unified_strdate,
|
|
||||||
)
|
)
|
||||||
from ..utils.traversal import traverse_obj
|
|
||||||
|
|
||||||
|
|
||||||
class RadioComercialIE(InfoExtractor):
|
class RadioComercialBaseExtractor(InfoExtractor):
|
||||||
_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/]+/\w(?P<season>\d+)/(?P<id>[-\w]+)/*$'
|
def _extract_page_content(self, url):
|
||||||
|
video_id = RadioComercialIE._match_id(url)
|
||||||
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
|
||||||
|
url = self._html_search_regex(r'<a.+?isExclusivePlay=.+?href="(.+?)">', webpage, 'url')
|
||||||
|
date = self._html_search_regex(r'<div[^"]+"date">(\d{4}-\d{2}-\d{2})</div>', webpage, 'date')
|
||||||
|
thumbnail = self._html_search_regex(r'<source[^"]+"image/jpeg[^/]+(.+?)\">', webpage, 'thumbnail')
|
||||||
|
season = int_or_none(self._html_search_regex(r'<h2>\w+\s(\d+)</h2>', webpage, 'season'))
|
||||||
|
episode_id = int_or_none(self._html_search_regex(r'episodeid=(\d+)&', url, 'episode_id'))
|
||||||
|
|
||||||
|
return {
|
||||||
|
'id': video_id,
|
||||||
|
'title': title,
|
||||||
|
'date': date,
|
||||||
|
'thumbnail': thumbnail,
|
||||||
|
'season': season,
|
||||||
|
'episode_id': episode_id,
|
||||||
|
'url': url
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class RadioComercialIE(RadioComercialBaseExtractor):
|
||||||
|
_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/]+/\w\d+/(?P<id>[-\w]+)/*$'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
|
'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
|
||||||
'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
|
'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
|
||||||
|
@ -25,10 +38,10 @@ class RadioComercialIE(InfoExtractor):
|
||||||
'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
|
'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
|
'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
|
||||||
'description': '',
|
'date': '2023-10-25',
|
||||||
'release_date': '20231025',
|
'thumbnail': r're:/upload/[^.]+.jpg',
|
||||||
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
|
'season': 6,
|
||||||
'season': 6
|
'episode_id': 220899
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -38,45 +51,20 @@ class RadioComercialIE(InfoExtractor):
|
||||||
'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
|
'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'title': 'Convença-me num minuto que os lobisomens existem',
|
'title': 'Convença-me num minuto que os lobisomens existem',
|
||||||
'description': '',
|
'date': '2023-10-26',
|
||||||
'release_date': '20231026',
|
'thumbnail': r're:/upload/[^.]+.jpg',
|
||||||
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
|
'season': 3,
|
||||||
'season': 3
|
'episode_id': 221210
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
|
|
||||||
'md5': '69be64255420fec23b7259955d771e54',
|
|
||||||
'info_dict': {
|
|
||||||
'id': 'o-desastre-de-aviao',
|
|
||||||
'ext': 'mp3',
|
|
||||||
'title': 'O desastre de avião',
|
|
||||||
'description': 'md5:8a82beeb372641614772baab7246245f',
|
|
||||||
'release_date': '20231101',
|
|
||||||
'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
|
|
||||||
'season': 2
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id, season = self._match_valid_url(url).group('id', 'season')
|
return self._extract_page_content(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
|
||||||
return {
|
|
||||||
'id': video_id,
|
|
||||||
'title': self._html_extract_title(webpage),
|
|
||||||
'description': self._og_search_description(webpage, default=''),
|
|
||||||
'release_date': unified_strdate(
|
|
||||||
get_element_by_class('date', get_element_html_by_class('descriptions', webpage))),
|
|
||||||
'thumbnail': self._og_search_thumbnail(webpage),
|
|
||||||
'season': int_or_none(season),
|
|
||||||
'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class RadioComercialPlaylistIE(InfoExtractor):
|
class RadioComercialPlaylistIE(RadioComercialBaseExtractor):
|
||||||
_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[-\w]+)[/\w\d+]*$'
|
_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[-\w]+)[/\w\d+]*$'
|
||||||
_PAGE_SIZE = 19
|
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
|
'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
|
@ -97,27 +85,57 @@ class RadioComercialPlaylistIE(InfoExtractor):
|
||||||
'id': 'as-minhas-coisas-favoritas',
|
'id': 'as-minhas-coisas-favoritas',
|
||||||
'title': 'As Minhas Coisas Favoritas',
|
'title': 'As Minhas Coisas Favoritas',
|
||||||
},
|
},
|
||||||
'playlist_mincount': 131
|
'playlist_mincount': 100
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def _fetch_page(self, url, season, page):
|
NextPage = namedtuple('NextPage', ['path', 'page', 'add_one'])
|
||||||
page += 1
|
|
||||||
next_page = f'{url}{"/" + str(page) if page > 1 else ""}'
|
def _extract_next_url_details(self, source):
|
||||||
webpage = self._download_webpage(next_page, season, headers={'X-Requested-With': 'XMLHttpRequest'},
|
regex = re.compile(
|
||||||
note=f'Downloading page: {next_page}')
|
r'\sclass="pagination__next"\shref="(?P<path>/podcasts/[^/]+?[/\w\d+/]+?/)(?P<page>\d+)/*(?P<add_one>\d*)')
|
||||||
episodes = set(traverse_obj(get_elements_html_by_class('tm-ouvir-podcast', webpage),
|
match = regex.search(source)
|
||||||
(..., {extract_attributes}, 'href')))
|
if match:
|
||||||
for entry in episodes:
|
return self.NextPage(match.group('path'),
|
||||||
yield self.url_result(f'https://radiocomercial.pt{entry}', RadioComercialIE)
|
int_or_none(match.group('page')), int_or_none(match.group('add_one')))
|
||||||
|
return self.NextPage(None, None, None)
|
||||||
|
|
||||||
|
def _get_next_page(self, webpage):
|
||||||
|
next_page = self._extract_next_url_details(webpage)
|
||||||
|
if not next_page.path or not next_page.page:
|
||||||
|
return None
|
||||||
|
number_section = f'{next_page.page if not next_page.add_one else next_page.page + 1}'
|
||||||
|
next_page = f'https://radiocomercial.pt{next_page.path}{number_section}'
|
||||||
|
video_id = self._match_id(next_page)
|
||||||
|
return self._download_webpage(next_page, video_id, headers={'X-Requested-With': 'XMLHttpRequest'})
|
||||||
|
|
||||||
|
def _collect_hrefs(self, webpage):
|
||||||
|
regex = re.compile(r'rounded-site-bottom"><a class="tm-ouvir-podcast" href="([^"]+)"')
|
||||||
|
matches = regex.finditer(webpage)
|
||||||
|
for match in matches:
|
||||||
|
yield f'https://radiocomercial.pt{match.group(1)}'
|
||||||
|
|
||||||
|
def _generate_sorted_entries(self, list_of_podcasts):
|
||||||
|
entries = [self._extract_page_content(item) for item in list_of_podcasts]
|
||||||
|
sorted_entries = sorted(entries, key=lambda x: x['date'], reverse=True)
|
||||||
|
for entry in sorted_entries:
|
||||||
|
yield entry
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
podcast = self._match_id(url)
|
podcast = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, podcast)
|
webpage = self._download_webpage(url, podcast)
|
||||||
|
|
||||||
name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
|
podcast_name = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'name')
|
||||||
season = self._html_extract_title(webpage)
|
podcast_season = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'season')
|
||||||
title = name if name == season else join_nonempty(name, season, delim=' - ')
|
podcast_title = podcast_name if podcast_name == podcast_season else f'{podcast_name} - {podcast_season}'
|
||||||
|
|
||||||
return self.playlist_result(OnDemandPagedList(functools.partial(self._fetch_page, url, season),
|
list_of_podcasts = set()
|
||||||
self._PAGE_SIZE), podcast, title)
|
while True:
|
||||||
|
get_entries = self._collect_hrefs(webpage)
|
||||||
|
if get_entries:
|
||||||
|
list_of_podcasts.update(get_entries)
|
||||||
|
webpage = self._get_next_page(webpage)
|
||||||
|
if not webpage:
|
||||||
|
break
|
||||||
|
|
||||||
|
return self.playlist_result(self._generate_sorted_entries(list_of_podcasts), podcast, podcast_title)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user