Compare commits


7 Commits

585d0ed9ab | JC-Chung | 2023-11-18 22:14:45 +00:00
[ie/twitcasting] Detect livestreams via API and show page (#8601)
Authored by: JC-Chung, bashonly

1fa3f24d4b | SirElderling | 2023-11-18 21:54:00 +00:00
[ie/theguardian] Add extractors (#8535)
Closes #8520
Authored by: SirElderling

ddb2d7588b | sepro | 2023-11-18 21:51:18 +00:00
[ie] Extract from media elements in SMIL manifests (#8504)
Authored by: seproDev

f223b1b078 | qbnu | 2023-11-18 21:49:23 +00:00
[ie/vocaroo] Do not use deprecated getheader (#8606)
Authored by: qbnu

6fe82491ed | Berkay | 2023-11-18 21:46:22 +00:00
[ie/twitter:broadcast] Extract concurrent_view_count (#8600)
Authored by: sonmezberkay

34df1c1f60 | sepro | 2023-11-18 20:28:25 +00:00
[ie/vidly] Add extractor (#8612)
Authored by: seproDev

1d24da6c89 | Simon Sawicki | 2023-11-18 21:04:42 +01:00
[ie/nintendo] Fix Nintendo Direct extraction (#8609)
Authored by: Grub4K
8 changed files with 359 additions and 64 deletions

yt_dlp/extractor/_extractors.py

@@ -2029,6 +2029,10 @@ from .tenplay import (
from .testurl import TestURLIE
from .tf1 import TF1IE
from .tfo import TFOIE
from .theguardian import (
    TheGuardianPodcastIE,
    TheGuardianPodcastPlaylistIE,
)
from .theholetv import TheHoleTvIE
from .theintercept import TheInterceptIE
from .theplatform import (
@@ -2301,6 +2305,7 @@ from .vidio import (
    VidioLiveIE
)
from .vidlii import VidLiiIE
from .vidly import VidlyIE
from .viewlift import (
    ViewLiftIE,
    ViewLiftEmbedIE,
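
Registering the two Guardian classes and VidlyIE here is essentially all the wiring the new extractors need; everything else lives in the extractor modules below. For orientation, a minimal extractor has roughly this shape (hypothetical example site, names and fields invented for illustration, not part of this changeset):

from yt_dlp.extractor.common import InfoExtractor


class ExampleIE(InfoExtractor):  # hypothetical extractor, illustration only
    _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)                   # the "id" group from _VALID_URL
        webpage = self._download_webpage(url, video_id)  # fetch and log the page download
        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'url': self._og_search_video_url(webpage),
        }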

yt_dlp/extractor/common.py

@@ -2341,7 +2341,9 @@ class InfoExtractor:
        imgs_count = 0
        srcs = set()
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        media = itertools.chain.from_iterable(
            smil.findall(self._xpath_ns(arg, namespace))
            for arg in ['.//video', './/audio', './/media'])
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
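
The replaced line concatenated two findall() results; the new expression lazily chains the lookups and adds './/media', so media elements in SMIL manifests are no longer ignored. A self-contained sketch of the same idiom (toy SMIL snippet, stdlib ElementTree, namespace handling via _xpath_ns omitted):

import itertools
import xml.etree.ElementTree as ET

smil = ET.fromstring(
    '<smil><body>'
    '<video src="clip.mp4"/><audio src="clip.mp3"/><media src="clip.m3u8"/>'
    '</body></smil>')

# chain.from_iterable flattens the per-tag result lists without building an intermediate list
media = itertools.chain.from_iterable(
    smil.findall(f'.//{tag}') for tag in ['video', 'audio', 'media'])
print([medium.get('src') for medium in media])  # ['clip.mp4', 'clip.mp3', 'clip.m3u8']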

yt_dlp/extractor/nintendo.py

@@ -1,57 +1,131 @@
import re
import json
import urllib.parse
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..utils import (
ExtractorError,
make_archive_id,
unified_timestamp,
urljoin,
)
from ..utils.traversal import traverse_obj
class NintendoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)'
_VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:(?P<locale>\w{2}(?:-\w{2})?)/)?nintendo-direct/(?P<slug>[^/?#]+)'
_TESTS = [{
'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/',
'info_dict': {
'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
'ext': 'flv',
'title': 'Duck Hunt Wii U VC NES - Trailer',
'duration': 60.326,
},
'params': {
'skip_download': True,
},
'add_ie': ['Ooyala'],
}, {
'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u',
'info_dict': {
'id': 'tokyo-mirage-sessions-fe-wii-u',
'title': 'Tokyo Mirage Sessions ♯FE',
},
'playlist_count': 4,
}, {
'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/',
'info_dict': {
'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V',
'ext': 'mp4',
'title': 'Switch_ROS_ND0904-H264.mov',
'duration': 2324.758,
'id': '2oPmiviVePUA1IqAZzjuVh',
'display_id': '09-04-2019',
'title': 'Nintendo Direct 9.4.2019',
'timestamp': 1567580400,
'description': 'md5:8aac2780361d8cb772b6d1de66d7d6f4',
'upload_date': '20190904',
'age_limit': 17,
'_old_archive_ids': ['nintendo J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V'],
},
'params': {
'skip_download': True,
}, {
'url': 'https://www.nintendo.com/en-ca/nintendo-direct/08-31-2023/',
'info_dict': {
'ext': 'mp4',
'id': '2TB2w2rJhNYF84qQ9E57hU',
'display_id': '08-31-2023',
'title': 'Super Mario Bros. Wonder Direct 8.31.2023',
'timestamp': 1693465200,
'description': 'md5:3067c5b824bcfdae9090a7f38ab2d200',
'tags': ['Mild Fantasy Violence', 'In-Game Purchases'],
'upload_date': '20230831',
'age_limit': 6,
},
}, {
'url': 'https://www.nintendo.com/us/nintendo-direct/50-fact-extravaganza/',
'info_dict': {
'ext': 'mp4',
'id': 'j0BBGzfw0pQ',
'channel_follower_count': int,
'view_count': int,
'description': 'Learn new details about Super Smash Bros. for Wii U, which launches on November 21.',
'duration': 2123,
'availability': 'public',
'thumbnail': 'https://i.ytimg.com/vi_webp/j0BBGzfw0pQ/maxresdefault.webp',
'timestamp': 1414047600,
'channel_id': 'UCGIY_O-8vW4rfX98KlMkvRg',
'chapters': 'count:53',
'heatmap': 'count:100',
'upload_date': '20141023',
'uploader_id': '@NintendoAmerica',
'playable_in_embed': True,
'categories': ['Gaming'],
'display_id': '50-fact-extravaganza',
'channel': 'Nintendo of America',
'tags': ['Comic Mischief', 'Cartoon Violence', 'Mild Suggestive Themes'],
'like_count': int,
'channel_url': 'https://www.youtube.com/channel/UCGIY_O-8vW4rfX98KlMkvRg',
'age_limit': 10,
'uploader_url': 'https://www.youtube.com/@NintendoAmerica',
'comment_count': int,
'live_status': 'not_live',
'uploader': 'Nintendo of America',
'title': '50-FACT Extravaganza',
},
'add_ie': ['Ooyala'],
}]
def _create_asset_url(self, path):
return urljoin('https://assets.nintendo.com/', urllib.parse.quote(path))
def _real_extract(self, url):
page_id = self._match_id(url)
locale, slug = self._match_valid_url(url).group('locale', 'slug')
webpage = self._download_webpage(url, page_id)
language, _, country = (locale or 'US').rpartition('-')
parsed_locale = f'{language.lower() or "en"}_{country.upper()}'
self.write_debug(f'Using locale {parsed_locale} (from {locale})', only_once=True)
entries = [
OoyalaIE._build_url_result(m.group('code'))
for m in re.finditer(
r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)]
response = self._download_json('https://graph.nintendo.com/', slug, query={
'operationName': 'NintendoDirect',
'variables': json.dumps({
'locale': parsed_locale,
'slug': slug,
}, separators=(',', ':')),
'extensions': json.dumps({
'persistedQuery': {
'version': 1,
'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb'
},
}, separators=(',', ':')),
})
# API returns `{"data": {"direct": null}}` if no matching id
direct_info = traverse_obj(response, ('data', 'direct', {dict}))
if not direct_info:
raise ExtractorError(f'No Nintendo Direct with id {slug} exists', expected=True)
title = self._html_search_regex(
r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>',
webpage, 'title', fatal=False)
errors = ', '.join(traverse_obj(response, ('errors', ..., 'message')))
if errors:
raise ExtractorError(f'GraphQL API error: {errors or "Unknown error"}')
return self.playlist_result(
entries, page_id, title)
result = traverse_obj(direct_info, {
'id': ('id', {str}),
'title': ('name', {str}),
'timestamp': ('startDate', {unified_timestamp}),
'description': ('description', 'text', {str}),
'age_limit': ('contentRating', 'order', {int}),
'tags': ('contentDescriptors', ..., 'label', {str}),
'thumbnail': ('thumbnail', {self._create_asset_url}),
})
result['display_id'] = slug
asset_id = traverse_obj(direct_info, ('video', 'publicId', {str}))
if not asset_id:
youtube_id = traverse_obj(direct_info, ('liveStream', {str}))
if not youtube_id:
self.raise_no_formats('Could not find any video formats', video_id=slug)
return self.url_result(youtube_id, **result, url_transparent=True)
if asset_id.startswith('Legacy Videos/'):
result['_old_archive_ids'] = [make_archive_id(self, asset_id[14:])]
result['formats'] = self._extract_m3u8_formats(
self._create_asset_url(f'/video/upload/sp_full_hd/v1/{asset_id}.m3u8'), slug)
return result
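
The rewritten extractor no longer scrapes Ooyala embed IDs from the page. It asks graph.nintendo.com for the Direct's metadata through a persisted GraphQL query, builds HLS formats from the assets.nintendo.com manifest when a publicId is present, and otherwise hands the liveStream value to the YouTube extractor via url_result. A rough standalone sketch of that request, reusing the hash and slug from the patch (stdlib only; real requests may need additional headers):

import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({
    'operationName': 'NintendoDirect',
    'variables': json.dumps({'locale': 'en_US', 'slug': '09-04-2019'}, separators=(',', ':')),
    'extensions': json.dumps({'persistedQuery': {
        'version': 1,
        'sha256Hash': '969b16fe9f08b686fa37bc44d1fd913b6188e65794bb5e341c54fa683a8004cb',
    }}, separators=(',', ':')),
})
with urllib.request.urlopen(f'https://graph.nintendo.com/?{params}') as resp:
    direct = json.load(resp)['data']['direct']  # None when no Direct matches the slug
print(direct and direct['name'])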

yt_dlp/extractor/periscope.py

@@ -35,6 +35,7 @@ class PeriscopeBaseIE(InfoExtractor):
            'uploader_id': broadcast.get('user_id') or broadcast.get('username'),
            'thumbnails': thumbnails,
            'view_count': int_or_none(broadcast.get('total_watched')),
            'concurrent_view_count': int_or_none(broadcast.get('total_watching')),
            'tags': broadcast.get('tags'),
            'live_status': {
                'running': 'is_live',
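
The single added line maps the broadcast's total_watching counter onto the standard concurrent_view_count field. Wrapping it in int_or_none means a missing or malformed value simply leaves the field empty instead of raising; roughly:

from yt_dlp.utils import int_or_none

print(int_or_none('42'))    # 42
print(int_or_none(None))    # None -- yt-dlp treats None metadata as "not available"
print(int_or_none('n/a'))   # None rather than a ValueError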

yt_dlp/extractor/theguardian.py (new file)

@@ -0,0 +1,135 @@
import itertools

from .common import InfoExtractor
from ..utils import (
    clean_html,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_class,
    get_elements_html_by_class,
    parse_qs,
    traverse_obj,
    unified_strdate,
    urljoin
)


class TheGuardianPodcastIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
        'md5': 'd1771744681789b4cd7da2a08e487702',
        'info_dict': {
            'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
            'ext': 'mp3',
            'title': 'We are just getting started: the plastic-eating bacteria that could change the world podcast',
            'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
            'creator': 'Stephen Buranyi',
            'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
            'release_date': '20231103'
        }
    }, {
        'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
        'md5': 'd1771744681789b4cd7da2a08e487702',
        'info_dict': {
            'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
            'ext': 'mp3',
            'title': 'The trials of Robert Habeck: is the worlds most powerful green politician doomed to fail? podcast',
            'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
            'creator': 'Philip Oltermann',
            'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
            'release_date': '20231030'
        }
    }, {
        'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
        'md5': 'a2fcff6f8e060a95b1483295273dc35e',
        'info_dict': {
            'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
            'ext': 'mp3',
            'title': 'Arsenal feel hard done by and Luton hold Liverpool Football Weekly',
            'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
            'creator': 'Max Rushden',
            'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
            'release_date': '20231106'
        }
    }, {
        'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
        'md5': '06a0f7e9701a80c8064a5d35690481ec',
        'info_dict': {
            'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
            'ext': 'mp3',
            'title': 'The Covid inquiry | Politics Weekly UK - podcast',
            'description': 'md5:207c98859c14903582b17d25b014046e',
            'creator': 'Gaby Hinsliff',
            'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
            'release_date': '20231102'
        }
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        return {
            'id': video_id,
            'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
            'description': self._og_search_description(webpage),
            'creator': self._html_search_meta('author', webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
            'url': extract_attributes(get_element_html_by_class(
                'podcast__player', webpage) or '').get('data-source'),
        }


class TheGuardianPodcastPlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
    _TESTS = [{
        'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
        'info_dict': {
            'id': 'theguardianswomensfootballweekly',
            'title': "The Guardian's Women's Football Weekly",
            'description': 'md5:e2cc021311e582d29935a73614a43f51'
        },
        'playlist_mincount': 69
    }, {
        'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
        'info_dict': {
            'id': 'todayinfocus',
            'title': 'Today in Focus',
            'description': 'md5:0f097764fc0d359e0b6eb537be0387e2'
        },
        'playlist_mincount': 1261
    }, {
        'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
        'info_dict': {
            'id': 'the-audio-long-read',
            'title': 'The Audio Long Read',
            'description': 'md5:5462994a27527309562b25b6defc4ef3'
        },
        'playlist_mincount': 996
    }]

    def _entries(self, url, playlist_id):
        for page in itertools.count(1):
            webpage, urlh = self._download_webpage_handle(
                url, playlist_id, f'Downloading page {page}', query={'page': page})
            if 'page' not in parse_qs(urlh.url):
                break

            episodes = get_elements_html_by_class('fc-item--type-media', webpage)
            for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'data-id')):
                yield url_path

    def _real_extract(self, url):
        podcast_id = self._match_id(url)
        webpage = self._download_webpage(url, podcast_id)

        title = clean_html(get_element_by_class(
            'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage)

        return self.playlist_from_matches(
            self._entries(url, podcast_id), podcast_id, title, description=description,
            ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))
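
TheGuardianPodcastPlaylistIE walks the series one ?page= at a time and stops as soon as the final URL no longer carries a page parameter, which is how the site signals that the requested page is past the end (it redirects back to the bare series URL). A standalone sketch of that stop condition (stdlib only, hypothetical loop limit):

import urllib.parse
import urllib.request

series = 'https://www.theguardian.com/news/series/todayinfocus'
for page in range(1, 500):
    with urllib.request.urlopen(f'{series}?page={page}') as resp:
        final_url = resp.url           # URL after any redirects
        html = resp.read().decode()    # the 'fc-item--type-media' episode cards live in here
    if 'page' not in urllib.parse.parse_qs(urllib.parse.urlparse(final_url).query):
        break                          # ?page= was dropped by a redirect: no more pages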

yt_dlp/extractor/twitcasting.py

@@ -11,7 +11,6 @@ from ..utils import (
    float_or_none,
    get_element_by_class,
    get_element_by_id,
    int_or_none,
    parse_duration,
    qualities,
    str_to_int,
@@ -242,35 +241,31 @@ class TwitCastingLiveIE(InfoExtractor):
'expected_exception': 'UserNotLive',
}]
_PROTECTED_LIVE_RE = r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)'
def _real_extract(self, url):
uploader_id = self._match_id(url)
self.to_screen(
'Downloading live video of user {0}. '
'Pass "https://twitcasting.tv/{0}/show" to download the history'.format(uploader_id))
webpage = self._download_webpage(url, uploader_id)
is_live = self._search_regex( # first pattern is for public live
(r'(data-is-onlive="true")', self._PROTECTED_LIVE_RE), webpage, 'is live?', default=None)
current_live = int_or_none(self._search_regex(
(r'data-type="movie" data-id="(\d+)">', # not available?
r'tw-sound-flag-open-link" data-id="(\d+)" style=', # not available?
r'data-movie-id="(\d+)"'), # if not currently live, value may be 0
webpage, 'current live ID', default=None))
if is_live and not current_live:
# fetch unfiltered /show to find running livestreams; we can't get ID of the password-protected livestream above
webpage = self._download_webpage(
f'https://twitcasting.tv/{uploader_id}/show/', uploader_id,
note='Downloading live history')
is_live = self._search_regex(self._PROTECTED_LIVE_RE, webpage, 'is live?', default=None)
if is_live:
# get the first live; running live is always at the first
current_live = self._search_regex(
r'(?s)<a\s+class="tw-movie-thumbnail2"\s*href="/[^/]+/movie/(?P<video_id>\d+)"\s*>.+?</a>',
webpage, 'current live ID 2', default=None, group='video_id')
if not current_live:
is_live = traverse_obj(self._download_json(
f'https://frontendapi.twitcasting.tv/watch/user/{uploader_id}',
uploader_id, 'Checking live status', data=b'', fatal=False), ('is_live', {bool}))
if is_live is False: # only raise here if API response was as expected
raise UserNotLive(video_id=uploader_id)
# Use /show/ page so that password-protected and members-only livestreams can be found
webpage = self._download_webpage(
f'https://twitcasting.tv/{uploader_id}/show/', uploader_id, 'Downloading live history')
is_live = is_live or self._search_regex(
r'(?s)(<span\s*class="tw-movie-thumbnail2-badge"\s*data-status="live">\s*LIVE)',
webpage, 'is live?', default=False)
# Current live is always the first match
current_live = self._search_regex(
r'(?s)<a\s+class="tw-movie-thumbnail2"\s+href="/[^/"]+/movie/(?P<video_id>\d+)"',
webpage, 'current live ID', default=None, group='video_id')
if not is_live or not current_live:
raise UserNotLive(video_id=uploader_id)
return self.url_result(f'https://twitcasting.tv/{uploader_id}/movie/{current_live}', TwitCastingIE)
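
The new flow first probes an API endpoint with an empty POST to learn whether the user is live at all, and only raises UserNotLive when the answer is an explicit false; it then reads the unfiltered /show/ history page, where password-protected and members-only streams still appear, and takes the first movie ID as the running broadcast. A rough sketch of the API probe (stdlib only, user name hypothetical, response shape as inferred from the patch):

import json
import urllib.request

uploader_id = 'example_user'  # hypothetical TwitCasting user name
req = urllib.request.Request(
    f'https://frontendapi.twitcasting.tv/watch/user/{uploader_id}',
    data=b'')  # an empty body turns the request into a POST, as in the patch
with urllib.request.urlopen(req) as resp:
    is_live = json.load(resp).get('is_live')
print(is_live)  # True / False; the extractor only gives up when this is exactly False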

yt_dlp/extractor/vidly.py (new file)

@@ -0,0 +1,83 @@
from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    mimetype2ext,
    url_or_none,
)
from ..utils.traversal import traverse_obj


class VidlyIE(InfoExtractor):
    _VALID_URL = r'https?://(?:vid\.ly/|(?:s\.)?vid\.ly/embeded\.html\?(?:[^#]+&)?link=)(?P<id>\w+)'
    _EMBED_REGEX = [r'<script[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//vid\.ly/\w+/embed[^\'"]+)',
                    r'<iframe[^>]+\bsrc=[\'"](?P<url>(?:https?:)?//(?:s\.)?vid\.ly/embeded\.html\?(?:[^#\'"]+&)?link=\w+[^\'"]+)']
    _TESTS = [{
        # JWPlayer 7, Embeds forbidden
        'url': 'https://vid.ly/2i3o9j/embed',
        'info_dict': {
            'id': '2i3o9j',
            'ext': 'mp4',
            'title': '2i3o9j',
            'thumbnail': r're:https://\w+\.cloudfront\.net/',
        },
    }, {
        # JWPlayer 6
        'url': 'http://s.vid.ly/embeded.html?link=jw_test&new=1&autoplay=true&controls=true',
        'info_dict': {
            'id': 'jw_test',
            'ext': 'mp4',
            'title': '2x8m8t',
            'thumbnail': r're:https://\w+\.cloudfront\.net/',
        },
    }, {
        # Vidlyplayer
        'url': 'https://vid.ly/7x0e6l',
        'info_dict': {
            'id': '7x0e6l',
            'ext': 'mp4',
            'title': '7x0e6l',
        },
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://www.petfinder.com/dog/gus-57378930/tn/ooltewah/furever-furkids-rescue-tn592/',
        'info_dict': {
            'id': 'w8p5b0',
            'ext': 'mp4',
            'title': 'w8p5b0',
            'thumbnail': r're:https://\w+\.cloudfront\.net/',
        }
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        embed_script = self._download_webpage(
            f'https://vid.ly/{video_id}/embed', video_id, headers={'Referer': 'https://vid.ly/'})
        player = self._search_json(r'initCallback\(', embed_script, 'player', video_id)

        player_type = player.get('player') or ''
        if player_type.startswith('jwplayer'):
            return self._parse_jwplayer_data(player['config'], video_id)
        elif not player_type.startswith('vidly'):
            raise ExtractorError(f'Unknown player type {player_type!r}')

        formats = []
        ext = mimetype2ext(traverse_obj(player, ('config', 'type')))
        for source, fid in [('source', 'sd'), ('source_hd', 'hd')]:
            if traverse_obj(player, ('config', source, {url_or_none})):
                formats.append({
                    'url': player['config'][source],
                    'format_id': f'http-{fid}',
                    'ext': ext,
                })

        # Has higher quality formats
        formats.extend(self._extract_m3u8_formats(
            f'https://d3fenhwk93s16g.cloudfront.net/{video_id}/hls.m3u8', video_id,
            fatal=False, note='Requesting higher quality m3u8 formats',
            errnote='No higher quality m3u8 formats found') or [])

        return {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }
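
With the class registered in _extractors.py, the new extractor handles direct vid.ly links as well as pages that merely embed the player (via _EMBED_REGEX). A quick way to exercise it from Python, using a URL from the test cases above:

import yt_dlp

with yt_dlp.YoutubeDL() as ydl:
    info = ydl.extract_info('https://vid.ly/7x0e6l', download=False)
print(info['id'], info['title'], [f['format_id'] for f in info.get('formats', [])])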

yt_dlp/extractor/vocaroo.py

@@ -57,7 +57,7 @@ class VocarooIE(InfoExtractor):
            'title': '',
            'url': url,
            'ext': 'mp3',
            'timestamp': float_or_none(resp.getheader('x-bz-upload-timestamp'), scale=1000),
            'timestamp': float_or_none(resp.headers.get('x-bz-upload-timestamp'), scale=1000),
            'vcodec': 'none',
            'http_headers': http_headers,
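
The one-line fix swaps the deprecated getheader() accessor for a lookup on the response's headers mapping; the value is identical, and float_or_none() still returns None when the header is absent. The same two access styles on a plain urllib response, for comparison:

from urllib.request import urlopen

with urlopen('https://example.com') as resp:                # any URL works for the comparison
    print(resp.getheader('Content-Type'))                   # legacy http.client-style accessor
    print(resp.headers.get('Content-Type'))                 # mapping-style access, same value
    print(resp.headers.get('x-bz-upload-timestamp'))        # None when the header is absent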
}