Compare commits

...

6 Commits

Author SHA1 Message Date
Mozi
367ed5dc17
Merge a9fc46dc7c into eb15fd5a32 2024-11-17 17:03:36 +01:00
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
Mozi
a9fc46dc7c use 'filter' in traversal 2024-11-16 07:04:10 +00:00
Mozi
a9aa5500a5 merge 'master' 2024-11-16 06:59:12 +00:00
Mozi
513d4c358e [ie/RTS] Support new URLs; fix tests; drop old useless formats 2024-09-16 19:07:46 +00:00
5 changed files with 382 additions and 141 deletions

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single video pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for one video page.

        Reads the `data-vid` attribute of the element with type="VideoStream"
        (a missing attribute raises KeyError), queries the metadata API and the
        `<data-vid>.json` descriptor (both non-fatal), and collects a direct
        HTTP format plus any HLS/DASH formats the descriptor advertises.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        player_html = get_element_html_by_attribute('type', 'VideoStream', webpage)
        attrs = extract_attributes(player_html or '')
        # Mandatory: used below as a scheme-less media URL
        direct_url = attrs['data-vid']

        file_name = remove_start(direct_url, 'kenh14cdn.com/')
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(file_name),
            video_id, fatal=False)

        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}
        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)

        # (descriptor key, manifest extractor, keyword naming the format id prefix);
        # HLS is handled before DASH, as the order of the tuple dictates
        manifest_handlers = (
            ('hls', self._extract_m3u8_formats_and_subtitles, {'m3u8_id': 'hls'}),
            ('mpd', self._extract_mpd_formats_and_subtitles, {'mpd_id': 'dash'}),
        )
        for key, extract_manifest, id_kwarg in manifest_handlers:
            manifest_url = traverse_obj(video_data, (key, {url_or_none}))
            if not manifest_url:
                continue
            fmts, subs = extract_manifest(manifest_url, video_id, fatal=False, **id_kwarg)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        api_fields = traverse_obj(metadata, {
            'duration': ('duration', {parse_duration}),
            'uploader': ('author', {strip_or_none}),
            'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
            'view_count': ('views', {int_or_none}),
        })

        # Title: API value first, then OpenGraph, then the on-page heading
        title = traverse_obj(metadata, ('title', {strip_or_none}))
        if not title:
            title = clean_html(self._og_search_title(webpage))
        if not title:
            title = clean_html(get_element_by_class('vdbw-title', webpage))

        description = clean_html(self._og_search_description(webpage))
        if not description:
            description = clean_html(get_element_by_class('vdbw-sapo', webpage))

        thumbnail = self._og_search_thumbnail(webpage) or attrs.get('data-thumb')

        # The keywords meta value is split on ';'; the trailing `filter` path
        # element presumably drops empty entries - confirm against traverse_obj
        tags = traverse_obj(self._html_search_meta('keywords', webpage), (
            {lambda keywords: keywords.split(';')}, ..., filter))

        return {
            **api_fields,
            'id': video_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'description': description,
            'thumbnail': thumbnail,
            'tags': tags,
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn."""

    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        """Build a playlist result from the `video-item` elements of the page.

        Title and description come from the `category-detail` element when it
        provides them, otherwise from a JSON-LD object that has both `name`
        and `alternateName`.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        category_detail = get_element_by_class('category-detail', webpage) or ''
        # `any` is passed as a traversal path element; presumably it picks the
        # first JSON-LD object matching the predicate - confirm against traverse_obj
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, ld: ld['name'] and ld['alternateName'], any)) or {}

        def video_page_url(item_html):
            # Each item is turned into a video page URL via its `data-id` attribute
            item_id = extract_attributes(item_html)['data-id']
            return 'https://video.kenh14.vn/video/video-{}.chn'.format(item_id)

        items = get_elements_html_by_class('video-item', webpage)

        title = clean_html(get_element_by_class('name', category_detail))
        if not title:
            title = unescapeHTML(embed_info.get('name'))

        description = clean_html(get_element_by_class('description', category_detail))
        if not description:
            description = unescapeHTML(embed_info.get('alternateName'))

        # The OpenGraph thumbnail is validated as a URL and its query string removed
        thumbnail = traverse_obj(
            self._og_search_thumbnail(webpage),
            ({url_or_none}, {update_url(query=None)}))

        return self.playlist_from_matches(
            items, playlist_id, title,
            getter=video_page_url, ie=Kenh14VideoIE,
            playlist_description=description, thumbnail=thumbnail)

View File

@ -1,182 +1,290 @@
import re import functools
from .srgssr import SRGSSRIE from .srgssr import SRGSSRIE
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
int_or_none, int_or_none,
parse_duration, orderedSet,
parse_iso8601, parse_iso8601,
unescapeHTML, parse_resolution,
urljoin,
) )
from ..utils.traversal import traverse_obj
class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE class RTSIE(SRGSSRIE):
_WORKING = False _GEO_COUNTRIES = ['CH']
IE_DESC = 'RTS.ch' IE_DESC = 'RTS.ch'
_VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' _VALID_URL = [
r'rts:(?P<id>\d+)',
r'https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html',
r'https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<display_id>.+?)-(?P<id>[0-9]+)\.html',
]
_TESTS = [ _TESTS = [
{ {
# article with videos
'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
'md5': '753b877968ad8afaeddccc374d4256a5',
'info_dict': { 'info_dict': {
'id': '3449373', 'id': '3449373',
'display_id': 'les-enfants-terribles',
'ext': 'mp4',
'duration': 1488,
'title': 'Les Enfants Terribles', 'title': 'Les Enfants Terribles',
'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.', 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
'uploader': 'Divers', 'display_id': 'les-enfants-terribles',
'upload_date': '19680921', 'tags': ['Divers', 'Archives TV', 'Culture et Arts', 'Les archives', 'Personnalités', 'RTS Archives', 'Années 1960', 'Autres arts', 'Décennies', 'Société'],
'timestamp': -40280400,
'thumbnail': r're:^https?://.*\.image',
'view_count': int,
}, },
'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'playlist': [{
}, 'info_dict': {
{ 'id': '3449373',
'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', 'ext': 'mp4',
'info_dict': { 'title': 'Les Enfants Terribles',
'id': '5624065', 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
'title': 'Passe-moi les jumelles', 'thumbnail': r're:^https?://.*\.image',
}, 'upload_date': '19680921',
'playlist_mincount': 4, 'timestamp': -40280400,
'duration': 1488,
'categories': ['Divers'],
},
}],
'params': {'skip_download': 'm3u8'}, # 700-byte first fragment
}, },
{ {
# video without text content
'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html',
'info_dict': { 'info_dict': {
'id': '5745975', 'id': '5745975',
'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski',
'ext': 'mp4',
'duration': 48,
'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski', 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
'description': 'Hockey - Playoff', 'description': 'Hockey - Playoff',
'uploader': 'Hockey', 'tags': ['Hockey', 'Sport', 'RTS Sport'],
'upload_date': '20140403',
'timestamp': 1396556882,
'thumbnail': r're:^https?://.*\.image',
'view_count': int,
}, },
'params': { 'playlist': [{
# m3u8 download 'info_dict': {
'skip_download': True, 'id': '5745975',
}, 'ext': 'mp4',
'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
'description': 'Hockey - Playoff',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20140403',
'timestamp': 1396556882,
'duration': 48,
'categories': ['Hockey sur glace'],
},
}],
'params': {'skip_download': 'm3u8'}, # 700-byte first fragment
'skip': 'Blocked outside Switzerland', 'skip': 'Blocked outside Switzerland',
}, },
{ {
# video player; redirection: https://www.rts.ch/play/tv/lactu-en-video/video/londres-cachee-par-un-epais-smog?urn=urn:rts:video:5745356
'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
'md5': '9bb06503773c07ce83d3cbd793cebb91',
'info_dict': { 'info_dict': {
'id': '5745356', 'id': '5745356',
'display_id': 'londres-cachee-par-un-epais-smog',
'ext': 'mp4', 'ext': 'mp4',
'duration': 33, 'duration': 33.76,
'title': 'Londres cachée par un épais smog', 'title': 'Londres cachée par un épais smog',
'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.',
'uploader': 'L\'actu en vidéo',
'upload_date': '20140403', 'upload_date': '20140403',
'timestamp': 1396537322, 'timestamp': 1396537322,
'thumbnail': r're:^https?://.*\.image', 'thumbnail': r're:^https?://.*\.image',
'view_count': int, 'webpage_url': 'srgssr:rts:video:5745356',
}, },
'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'params': {'skip_download': 'm3u8'}, # 700-byte first fragment
}, },
{ {
# audio & podcast
'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
'info_dict': { 'info_dict': {
'id': '5706148', 'id': '5706148',
'display_id': 'urban-hippie-de-damien-krisl-03-04-2014',
'ext': 'mp3',
'duration': 123,
'title': '"Urban Hippie", de Damien Krisl', 'title': '"Urban Hippie", de Damien Krisl',
'description': 'Des Hippies super glam.', 'description': 'Des Hippies super glam.',
'upload_date': '20140403', 'display_id': 'urban-hippie-de-damien-krisl',
'timestamp': 1396551600, 'tags': ['Media Radio', 'Couleur3'],
}, },
'playlist': [{
'info_dict': {
'id': '5706148',
'ext': 'mp3',
'title': '"Urban Hippie", de Damien Krisl',
'description': 'Des Hippies super glam.',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20140403',
'timestamp': 1396546481,
'duration': 123,
'categories': ['La belle vidéo de Stéphane Laurenceau'],
},
}, {
'info_dict': {
'id': '5747185',
'ext': 'mp3',
'title': 'Le musée du psychédélisme',
'description': 'md5:72f8662f48c32050ae817e3bde7e0acc',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20140402',
'timestamp': 1396476000,
'duration': 274,
'categories': ['Happy Culture'],
},
}, {
'info_dict': {
'id': '5706149',
'ext': 'mp3',
'title': 'Silk Art Hippie Culture',
'description': 'md5:8e3b9d8d84d85ca8a1905cf50b39bba4',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20140403',
'timestamp': 1396545649,
'duration': 161,
'categories': ['Happy Pics'],
},
}, {
'info_dict': {
'id': '5706148',
'ext': 'mp3',
'title': '"Urban Hippie", de Damien Krisl',
'description': 'Des Hippies super glam.',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20140403',
'timestamp': 1396546481,
'duration': 123,
'categories': ['La belle vidéo de Stéphane Laurenceau'],
},
}],
}, },
{ {
# article with videos on rhs # article with videos on rhs
'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html',
'info_dict': { 'info_dict': {
'id': '6693917', 'id': '6693917',
'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', 'title': 'Davos décroche le 31e titre de son histoire',
'description': 'md5:3c9a767b2a332413eda33c526024578c',
'display_id': 'hockey-davos-decroche-son-31e-titre-de-champion-de-suisse',
'tags': ['Hockey', 'Tout le sport', 'RTS Info', 'LNA', "Toute l'info", 'RTS Sport'],
}, },
'playlist_mincount': 5, 'playlist_mincount': 5,
'skip': 'Blocked outside Switzerland',
},
{
# articles containing recordings of TV shows
'url': 'https://www.rts.ch/info/regions/valais/12865814-un-bouquetin-emporte-par-un-aigle-royal-sur-les-hauts-de-fully-vs.html',
'info_dict': {
'id': '12865814',
'title': 'Un bouquetin emporté par un aigle royal sur les hauts de Fully (VS)',
'description': 'md5:9b511f89075e2730bd2dd59915c25574',
'display_id': 'un-bouquetin-emporte-par-un-aigle-royal-sur-les-hauts-de-fully-vs',
'tags': ['Régions', 'RTS Info', 'Valais', "Toute l'info"],
},
'playlist': [{
'info_dict': {
'id': '12861415',
'ext': 'mp4',
'title': 'En Valais, un bouquetin emporté dans les airs par un aigle royal. Décryptage dune image rare.',
'thumbnail': r're:^https?://.*\.image',
'timestamp': 1644690600,
'upload_date': '20220212',
'duration': 107,
'categories': ['19h30'],
},
}],
'params': {'skip_download': 'm3u8'}, # 700-byte first fragment
'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
},
{
# new URL format; article with videos
'url': 'https://www.rts.ch/info/suisse/2024/article/doris-leuthard-il-y-a-des-alternatives-au-nucleaire-qui-sont-moins-risquees-28631869.html',
'info_dict': {
'id': '28631869',
'title': 'Doris Leuthard: "Il y a des alternatives au nucléaire qui sont moins risquées"',
'description': 'md5:ba9930e218dcd177801a34b89a16b86e',
'display_id': 'doris-leuthard-il-y-a-des-alternatives-au-nucleaire-qui-sont-moins-risquees',
'tags': 'count:13',
},
'playlist': [{
'info_dict': {
'id': '15162786',
'ext': 'mp4',
'title': 'L\'invitée de La Matinale (vidéo) - Doris Leuthard, co-présidente du projet d\'exposition nationale Svizra27',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20240916',
'timestamp': 1726462800,
'duration': 860,
'categories': ['La Matinale'],
},
}, {
'info_dict': {
'id': '15164848',
'ext': 'mp4',
'title': 'Le Centre pourrait faire pencher la balance en faveur de la construction de nouvelles centrales nucléaires',
'thumbnail': r're:^https?://.*\.image',
'upload_date': '20240916',
'timestamp': 1726502400,
'duration': 227,
'categories': ['Forum'],
},
}],
'params': {'skip_download': 'm3u8'}, # 700-byte first fragment
'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
}, },
{ {
'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html',
'only_matching': True, 'only_matching': True,
}, },
{
'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
'only_matching': True,
},
] ]
def _real_extract(self, url): def _real_extract(self, url):
m = self._match_valid_url(url) webpage, urlh = self._download_webpage_handle(url, self._match_id(url))
media_id = m.group('rts_id') or m.group('id') if urlh.url != url:
display_id = m.group('display_id') or media_id return self.url_result(urlh.url)
def download_json(internal_id): mobj = self._match_valid_url(url)
return self._download_json( display_id = traverse_obj(mobj, 'display_id', default=mobj.group('id')) or mobj.group('id')
f'http://www.rts.ch/a/{internal_id}.html?f=json/article',
display_id)
all_info = download_json(media_id) media_list = []
article_details = self._search_json(r'articleDetails\s*=\s*', webpage, 'article details', display_id)
traverse_obj(article_details, ('mainMedia', filter, {media_list.append}))
traverse_obj(article_details, ('innerMediaElements', filter, {media_list.extend}))
traverse_obj(article_details, ('mediaElements', filter, {media_list.extend}))
media_list = orderedSet(media_list)
# media_id extracted out of URL is not always a real id entries = []
if 'video' not in all_info and 'audio' not in all_info: for media in media_list:
entries = [] media_id = media['oid']
media_info = self._get_media_data('rts', media['type'], media_id)
for item in all_info.get('items', []): if fmts := self._extract_formats(media_info, media_id):
item_url = item.get('url') entries.append({
if not item_url: 'id': media_info['id'],
continue 'title': media_info['title'],
entries.append(self.url_result(item_url, 'RTS')) 'formats': fmts,
'description': media_info.get('description'),
'thumbnails': [traverse_obj(media_info, ('imageUrl', {lambda x: {
'url': x,
**parse_resolution(x),
}}))],
'timestamp': parse_iso8601(media_info.get('date')),
'duration': traverse_obj(media_info, ('duration', {functools.partial(int_or_none, scale=1000)})),
'categories': [media.get('category')],
})
if not entries: return self.playlist_result(
page, urlh = self._download_webpage_handle(url, display_id) entries, article_details.get('oid'), article_details.get('title'),
if re.match(self._VALID_URL, urlh.url).group('id') != media_id: article_details.get('lead'), display_id=display_id,
return self.url_result(urlh.url, 'RTS') tags=traverse_obj(article_details, ('tags', ..., 'name')))
# article with videos on rhs
videos = re.findall(
r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"',
page)
if not videos:
videos = re.findall(
r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"',
page)
if videos:
entries = [self.url_result(f'srgssr:{video_urn}', 'SRGSSR') for video_urn in videos]
if entries:
return self.playlist_result(entries, media_id, all_info.get('title'))
internal_id = self._html_search_regex(
r'<(?:video|audio) data-id="([0-9]+)"', page,
'internal video id')
all_info = download_json(internal_id)
media_type = 'video' if 'video' in all_info else 'audio'
# check for errors
self._get_media_data('rts', media_type, media_id)
info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
title = info['title']
def _extract_formats(self, media_info, media_id):
def extract_bitrate(url): def extract_bitrate(url):
return int_or_none(self._search_regex( return int_or_none(self._search_regex(
r'-([0-9]+)k\.', url, 'bitrate', default=None)) r'-([0-9]+)k\.', url, 'bitrate', default=None))
formats = [] formats = []
streams = info.get('streams', {}) for idx, stream in enumerate(traverse_obj(
for format_id, format_url in streams.items(): media_info, ('resourceList', lambda _, v: v['url']))):
if format_id == 'hds_sd' and 'hds' in streams: format_id = stream.get('protocol') or str(idx)
format_url = stream['url']
if format_id == 'hds_sd' and 'hds' in stream:
continue continue
if format_id == 'hls_sd' and 'hls' in streams: if format_id == 'hls_sd' and 'hls' in stream:
continue continue
ext = determine_ext(format_url) ext = determine_ext(format_url)
if ext in ('m3u8', 'f4m'): if ext in ('m3u8', 'f4m'):
@ -195,37 +303,5 @@ class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE
'tbr': extract_bitrate(format_url), 'tbr': extract_bitrate(format_url),
}) })
download_base = 'http://rtsww{}-d.rts.ch/'.format('-a' if media_type == 'audio' else '')
for media in info.get('media', []):
media_url = media.get('url')
if not media_url or re.match(r'https?://', media_url):
continue
rate = media.get('rate')
ext = media.get('ext') or determine_ext(media_url, 'mp4')
format_id = ext
if rate:
format_id += '-%dk' % rate
formats.append({
'format_id': format_id,
'url': urljoin(download_base, media_url),
'tbr': rate or extract_bitrate(media_url),
})
self._check_formats(formats, media_id) self._check_formats(formats, media_id)
return formats
duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
if isinstance(duration, str):
duration = parse_duration(duration)
return {
'id': media_id,
'display_id': display_id,
'formats': formats,
'title': title,
'description': info.get('intro'),
'duration': duration,
'view_count': int_or_none(info.get('plays')),
'uploader': info.get('programName'),
'timestamp': parse_iso8601(info.get('broadcast_date')),
'thumbnail': unescapeHTML(info.get('preview_image_url')),
}

View File

@ -57,7 +57,7 @@ class SRGSSRIE(InfoExtractor):
def _get_media_data(self, bu, media_type, media_id): def _get_media_data(self, bu, media_type, media_id):
query = {'onlyChapters': True} if media_type == 'video' else {} query = {'onlyChapters': True} if media_type == 'video' else {}
full_media_data = self._download_json( full_media_data = self._download_json(
f'https://il.srgssr.ch/integrationlayer/2.0/{bu}/mediaComposition/{media_type}/{media_id}.json', f'https://il.srgssr.ch/integrationlayer/2.0/mediaComposition/byUrn/urn:{bu}:{media_type}:{media_id}.json',
media_id, query=query)['chapterList'] media_id, query=query)['chapterList']
try: try:
media_data = next( media_data = next(
@ -165,7 +165,7 @@ class SRGSSRPlayIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'md5': '6db2226ba97f62ad42ce09783680046c', 'md5': '81c6ad90d774c46e3c54ea2f01a94db3',
'info_dict': { 'info_dict': {
'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'ext': 'mp4', 'ext': 'mp4',
@ -173,7 +173,7 @@ class SRGSSRPlayIE(InfoExtractor):
'title': 'Snowden beantragt Asyl in Russland', 'title': 'Snowden beantragt Asyl in Russland',
'timestamp': 1372708215, 'timestamp': 1372708215,
'duration': 113.827, 'duration': 113.827,
'thumbnail': r're:^https?://.*1383719781\.png$', 'thumbnail': r're:^https?://download-media\.srf\.ch/.*\.(?:png|jpg)$',
}, },
'expected_warnings': ['Unable to download f4m manifest'], 'expected_warnings': ['Unable to download f4m manifest'],
}, { }, {
@ -185,6 +185,7 @@ class SRGSSRPlayIE(InfoExtractor):
'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem',
'timestamp': 1444709160, 'timestamp': 1444709160,
'duration': 336.816, 'duration': 336.816,
'thumbnail': r're:^https?://download-media\.srf\.ch/.*\.(?:png|jpg)$',
}, },
'params': { 'params': {
# rtmp download # rtmp download
@ -217,7 +218,7 @@ class SRGSSRPlayIE(InfoExtractor):
'duration': 94.0, 'duration': 94.0,
'upload_date': '20170215', 'upload_date': '20170215',
'timestamp': 1487173560, 'timestamp': 1487173560,
'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', 'thumbnail': r're:https?://cdn\.prod\.swi-services\.ch/.+',
'subtitles': 'count:9', 'subtitles': 'count:9',
}, },
'params': { 'params': {