Compare commits

...

5 Commits

Author SHA1 Message Date
red-acid
0c958779a7
Merge 049862b803 into eb15fd5a32 2024-11-17 17:03:36 +01:00
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
red-acid
049862b803
Apply suggestions from code review
Co-authored-by: N/Ame <173015200+grqz@users.noreply.github.com>
2024-10-23 11:11:24 +01:00
red-acid
c2b5c7025c
Update rtp.py 2024-10-14 12:36:35 +01:00
4 changed files with 299 additions and 43 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    # Single-video pages on video.kenh14.vn; the digits right before ".chn" are the video ID.
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video page.

        The page's element with ``type="VideoStream"`` carries a ``data-vid``
        attribute holding the scheme-less media URL. That URL is used as a
        direct HTTP format, as the ``FileName`` query for the metadata API
        (with a leading ``kenh14cdn.com/`` removed), and, with ``.json``
        appended, as the location of a document that may list ``hls`` and
        ``mpd`` manifests. Both JSON requests are non-fatal.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        attrs = extract_attributes(player_html or '')
        # Indexing (not .get) on purpose: a page without `data-vid` cannot be extracted
        direct_url = attrs['data-vid']

        file_name = remove_start(direct_url, 'kenh14cdn.com/')
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(file_name),
            video_id, fatal=False)

        # The direct file is always offered; manifests found below are added on top
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}

        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)

        hls_url = traverse_obj(video_data, ('hls', {url_or_none}))
        if hls_url:
            hls_formats, hls_subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subtitles, target=subtitles)

        dash_url = traverse_obj(video_data, ('mpd', {url_or_none}))
        if dash_url:
            dash_formats, dash_subtitles = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(dash_formats)
            self._merge_subtitles(dash_subtitles, target=subtitles)

        # API title first, then Open Graph, then the on-page title element
        title = (
            traverse_obj(metadata, ('title', {strip_or_none}))
            or clean_html(self._og_search_title(webpage))
            or clean_html(get_element_by_class('vdbw-title', webpage)))
        description = (
            clean_html(self._og_search_description(webpage))
            or clean_html(get_element_by_class('vdbw-sapo', webpage)))
        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')
        # The keywords meta is split on ';' and falsy pieces are filtered out
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda keywords: keywords.split(';')}, ..., filter))

        return {
            **traverse_obj(metadata, {
                'duration': ('duration', {parse_duration}),
                'uploader': ('author', {strip_or_none}),
                'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
                'view_count': ('views', {int_or_none}),
            }),
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    # Playlist pages live under /playlist/; the digits right before ".chn" are the playlist ID.
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist result from a playlist page.

        Every element with class ``video-item`` becomes one entry, resolved
        through ``Kenh14VideoIE`` via a URL built from its ``data-id``
        attribute. Title and description come from the ``category-detail``
        element, falling back to the first JSON-LD object whose ``name`` and
        ``alternateName`` are both truthy.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        json_ld_entries = self._yield_json_ld(webpage, playlist_id)
        embed_info = traverse_obj(
            json_ld_entries,
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        def video_url_from_item(item_html):
            # Canonical video URL for one `video-item` element
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(
                extract_attributes(item_html)['data-id'])

        items = get_elements_html_by_class('video-item', webpage)
        title = (
            clean_html(get_element_by_class('name', category_detail))
            or unescapeHTML(embed_info.get('name')))
        description = (
            clean_html(get_element_by_class('description', category_detail))
            or unescapeHTML(embed_info.get('alternateName')))
        # The Open Graph image URL is kept only if valid, with its query string removed
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            items, playlist_id, title,
            getter=video_url_from_item, ie=Kenh14VideoIE,
            playlist_description=description, thumbnail=thumbnail)

View File

@ -4,40 +4,97 @@ import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import js_to_json from ..utils import (
ExtractorError,
determine_ext,
join_nonempty,
js_to_json,
)
class RTPIE(InfoExtractor): class RTPIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/(?:(?:estudoemcasa|palco|zigzag)/)?p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)' _VALID_URL = r'https?://(?:(?:www\.)?rtp\.pt/play/(?P<subarea>.*/)?p(?P<program_id>\d+)/|arquivos\.rtp\.pt/conteudos/)(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas', 'url': 'https://www.rtp.pt/play/p9165/e562949/por-do-sol',
'md5': 'e736ce0c665e459ddb818546220b4ef8',
'info_dict': { 'info_dict': {
'id': 'e174042', 'id': 'e562949',
'ext': 'mp3',
'title': 'Paixões Cruzadas',
'description': 'As paixões musicais de António Cartaxo e António Macedo',
'thumbnail': r're:^https?://.*\.jpg',
},
}, {
'url': 'https://www.rtp.pt/play/zigzag/p13166/e757904/25-curiosidades-25-de-abril',
'md5': '9a81ed53f2b2197cfa7ed455b12f8ade',
'info_dict': {
'id': 'e757904',
'ext': 'mp4', 'ext': 'mp4',
'title': '25 Curiosidades, 25 de Abril', 'title': 'Pôr do Sol Episódio 1',
'description': 'Estudar ou não estudar - Em cada um dos episódios descobrimos uma curiosidade acerca de como era viver em Portugal antes da revolução do 25 de abr', 'description': 'Madalena Bourbon de Linhaça vive atormentada pelo segredo que esconde desde 1990. Matilde Bourbon de Linhaça sonha fugir com o seu amor proibido. O',
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:https?://.*\.(?:jpg|png)',
}, },
}, { }, {
'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'url': 'https://www.rtp.pt/play/p12646/e738493/telejornal',
'only_matching': True, 'info_dict': {
'id': 'e738493',
'ext': 'mp4',
'title': 'Telejornal de 01 jan 2024 PARTE 1',
'description': 'A mais rigorosa seleção de notícias, todos os dias às 20h00. De segunda a domingo, João Adelino Faria, José Rodrigues dos Santos e Ana Lourenço',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, { }, {
'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/portugues-1-ano', 'url': 'https://www.rtp.pt/play/p6646/e457262/grande-entrevista',
'only_matching': True, 'info_dict': {
'id': 'e457262',
'ext': 'mp4',
'title': 'Grande Entrevista Episódio 7 - de 19 fev 2020',
'description': 'Bruno Nogueira - É um dos mais originais humoristas portugueses e de maior êxito! Bruno Nogueira na Grande Entrevista com Vítor Gonçalves.',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, { }, {
'url': 'https://www.rtp.pt/play/palco/p13785/l7nnon', 'url': 'https://www.rtp.pt/play/p1525/e738522/a-mosca',
'only_matching': True, 'info_dict': {
'id': 'e738522',
'ext': 'mp4',
'title': 'A Mosca de 02 jan 2024',
'description': 'Ano novo, vida nova - Ano novo, vida nova',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, {
'url': 'https://www.rtp.pt/play/estudoemcasa/p7776/e539826/portugues-1-ano',
'info_dict': {
'id': 'e539826',
'ext': 'mp4',
'title': 'Português - 1.º ano , aula 45 - 27 abr 2021',
'description': 'A História do Pedrito Coelho, de Beatrix Potter. O dígrafo \'lh\' - A História do Pedrito Coelho, de Beatrix Potter. O dígrafo \'lh\'.',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, {
'url': 'https://www.rtp.pt/play/zigzag/p13857/e794575/zig-zag-zzz-e-amigos',
'info_dict': {
'id': 'e794575',
'ext': 'mp4',
'title': 'Zig, Zag, Zzz e Amigos Episódio 1 - de 16 set 2024',
'description': 'O Brinquedo Perdido - Zig, Zag e Zzz são três amigos inseparáveis que partilham aventuras emocionantes e cheias de imaginação. Exploram o mundo <20>',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, {
'url': 'https://www.rtp.pt/play/palco/p13151/premio-miguel-rovisco-2023-requiem-por-isabel',
'info_dict': {
'id': 'premio-miguel-rovisco-2023-requiem-por-isabel',
'ext': 'mp4',
'title': 'Prémio Miguel Rovisco 23: Requiem Por Isabel de 30 mar 2024',
'description': 'Lucrécia foi a atriz mais famosa e requisitada do seu tempo. Este já não é o seu tempo. A debater-se com a decrepitude física e financeira, foi o',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, {
'url': 'https://arquivos.rtp.pt/conteudos/liga-dos-ultimos-152/',
'info_dict': {
'id': 'liga-dos-ultimos-152',
'ext': 'mp4',
'title': 'Liga dos Últimos RTP Arquivos',
'description': 'Magazine desportivo, com apresentação de Álvaro Costa e comentários em estúdio do professor Hernâni Gonçalves e do sociólogo João Nuno Coelho. Destaque para os jogos de futebol das equipas dos escalões secundários de Portugal, com momentos dos jogos: Agrário de Lamas vs Pampilhoense e Apúlia vs Fragoso.',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}, {
'url': 'https://www.rtp.pt/play/p510/e786608/aleixo-fm',
'info_dict': {
'id': 'e786608',
'ext': 'mp3',
'title': 'Aleixo FM de 31 jul 2024',
'description': 'Melhor dia pra casar - Já o diz Joaquim de Magalhães Fernandes Barreiros, comummente conhecido como Quim Barreiros. Mas será mesmo este o melhor di',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
}] }]
_RX_OBFUSCATION = re.compile(r'''(?xs) _RX_OBFUSCATION = re.compile(r'''(?xs)
@ -60,33 +117,69 @@ class RTPIE(InfoExtractor):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = self._html_search_meta(
'twitter:title', webpage, display_name='title', fatal=True)
# Title tag includes relevant data
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title', default='')
# Raise error if episode is unavailable
if 'Este episódio não se encontra disponível' in title:
raise ExtractorError('Episode unavailable', expected=True)
# Replace irrelevant text in title
title = re.sub(r' - ?(RTP Play|Estudo Em Casa|Zig Zag Play|RTP Palco)( - RTP)?', '', title)
# Check if it's an episode split in parts
part = self._html_search_regex(r'section\-parts.*<span.*>(.+?)</span>.*</ul>', webpage, 'part', default=None)
# Add episode part identification to title if it exists
title = join_nonempty(title, part, delim=' ')
# Extract f and config from page
f, config = self._search_regex( f, config = self._search_regex(
r'''(?sx) r'''(?sx)
(?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)? (?:var\s+f\s*=\s*(?P<f>".*?"|{[^;]+?});\s*)?
var\s+player1\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/) var\s+player1?\s+=\s+new\s+RTPPlayer\s*\((?P<config>{(?:(?!\*/).)+?})\);(?!\s*\*/)
''', webpage, ''', webpage,
'player config', group=('f', 'config')) 'player config', group=('f', 'config'))
config = self._parse_json( config = self._parse_json(
config, video_id, config, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id)) lambda data: self.__unobfuscate(data, video_id=video_id))
# Estudo em Casa / Zig Zag / Palco / RTP Arquivos subareas don't include f
f = config['file'] if not f else self._parse_json( f = config['file'] if not f else self._parse_json(
f, video_id, f, video_id,
lambda data: self.__unobfuscate(data, video_id=video_id)) lambda data: self.__unobfuscate(data, video_id=video_id))
formats = [] formats = []
if isinstance(f, dict): if isinstance(f, dict):
f_hls = f.get('hls') file_hls = f.get('hls')
if f_hls is not None: file_fps = f.get('fps')
formats.extend(self._extract_m3u8_formats(
f_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')) if file_fps is not None:
# RTP Arquivos specific use case
if '/arquivo/' in file_fps:
file_key = config['fileKey']
split_file_key = file_key.split('/')
filename = split_file_key[-1]
del split_file_key[-1]
split_file_key.extend([f'index.m3u8?tlm=hls&streams={filename}.m3u8'])
path = '/'.join(split_file_key)
file_hls = f'https://streaming-arquivo-ondemand.rtp.pt/nas2.share{path}'
elif file_hls is None:
file_hls = file_fps.replace('drm-fps', 'hls')
formats.extend(self._extract_m3u8_formats(
file_hls, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
else:
ext = determine_ext(f)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
f, video_id, 'mp4', 'm3u8_native', m3u8_id='hls'))
f_dash = f.get('dash')
if f_dash is not None:
formats.extend(self._extract_mpd_formats(f_dash, video_id, mpd_id='dash'))
else: else:
formats.append({ formats.append({
'format_id': 'f', 'format_id': 'f',
@ -95,7 +188,6 @@ class RTPIE(InfoExtractor):
}) })
subtitles = {} subtitles = {}
vtt = config.get('vtt') vtt = config.get('vtt')
if vtt is not None: if vtt is not None:
for lcode, lname, url in vtt: for lcode, lname, url in vtt:
@ -108,7 +200,7 @@ class RTPIE(InfoExtractor):
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'formats': formats, 'formats': formats,
'description': self._html_search_meta(['description', 'twitter:description'], webpage), 'description': self._html_search_meta(['og:description', 'description'], webpage),
'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage), 'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
'subtitles': subtitles, 'subtitles': subtitles,
} }