Compare commits

...

16 Commits

Author SHA1 Message Date
Nicolas Dato
6511c09851 [rudovideo] forgot to rename a variable 2023-12-21 15:41:07 -03:00
Nicolas Dato
9c8175c7e0 [rudovideo] adding support for MP3 podcasts 2023-12-21 15:39:16 -03:00
Nicolas Dato
878c6bcace Merge branch 'master' into c13cl 2023-12-21 15:30:26 -03:00
bashonly
37755a037e
[test:networking] Update tests for OpenSSL 3.2 (#8814)
Authored by: bashonly
2023-12-20 19:03:54 +00:00
coletdjnz
196eb0fe77
[networking] Strip whitespace around header values (#8802)
Fixes https://github.com/yt-dlp/yt-dlp/issues/8729
Authored by: coletdjnz
2023-12-20 19:15:38 +13:00
Mozi
db8b4edc7d
[ie/JoqrAg] Add extractor (#8384)
Authored by: pzhlkj6612
2023-12-19 14:21:47 +00:00
bashonly
1c54a98e19
[ie/twitter] Extract stale tweets (#8724)
Closes #8691
Authored by: bashonly
2023-12-19 13:24:55 +00:00
Simon Sawicki
00a3e47bf5
[ie/bundestag] Add extractor (#8783)
Authored by: Grub4K
2023-12-18 21:32:08 +01:00
Amir Y. Perehodnik
c5f01bf7d4
[ie/Maariv] Add extractor (#8331)
Authored by: amir16yp
2023-12-18 16:52:43 +01:00
Tristan Charpentier
c91af948e4
[ie/RinseFM] Add extractor (#8778)
Authored by: hashFactory
2023-12-17 14:07:55 +00:00
Pandey Ganesha
6b5d93b0b0
[ie/youtube] Fix like_count extraction (#8763)
Closes #8759
Authored by: Ganesh910
2023-12-13 07:04:12 +00:00
pukkandan
298230e550
[webvtt] Fix 15f22b4880 2023-12-13 05:11:45 +05:30
Mozi
d5d1517e7d
[ie/eplus] Add login support and DRM detection (#8661)
Authored by: pzhlkj6612
2023-12-12 00:29:36 +00:00
trainman261
7e09c147fd
[ie/theplatform] Extract more metadata (#8635)
Authored by: trainman261
2023-12-12 00:00:35 +00:00
Benjamin Krausse
e370f9ec36
[ie] Add media_type field
Authored by: trainman261
2023-12-11 17:57:41 -06:00
SirElderling
b1a1ec1540
[ie/bitchute] Fix and improve metadata extraction (#8507)
Closes #8492
Authored by: SirElderling
2023-12-11 23:56:01 +00:00
24 changed files with 640 additions and 58 deletions

View File

@ -1333,6 +1333,7 @@ The available fields are:
- `was_live` (boolean): Whether this video was originally a live stream - `was_live` (boolean): Whether this video was originally a live stream
- `playable_in_embed` (string): Whether this video is allowed to play in embedded players on other sites - `playable_in_embed` (string): Whether this video is allowed to play in embedded players on other sites
- `availability` (string): Whether the video is "private", "premium_only", "subscriber_only", "needs_auth", "unlisted" or "public" - `availability` (string): Whether the video is "private", "premium_only", "subscriber_only", "needs_auth", "unlisted" or "public"
- `media_type` (string): The type of media as classified by the site, e.g. "episode", "clip", "trailer"
- `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL - `start_time` (numeric): Time in seconds where the reproduction should start, as specified in the URL
- `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL - `end_time` (numeric): Time in seconds where the reproduction should end, as specified in the URL
- `extractor` (string): Name of the extractor - `extractor` (string): Name of the extractor

View File

@ -328,7 +328,7 @@ class TestHTTPRequestHandler(TestRequestHandlerBase):
https_server_thread.start() https_server_thread.start()
with handler(verify=False) as rh: with handler(verify=False) as rh:
with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info: with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info:
validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers')) validate_and_send(rh, Request(f'https://127.0.0.1:{https_port}/headers'))
assert not issubclass(exc_info.type, CertificateVerifyError) assert not issubclass(exc_info.type, CertificateVerifyError)

View File

@ -2370,6 +2370,11 @@ Line 1
headers4 = HTTPHeaderDict({'ytdl-test': 'data;'}) headers4 = HTTPHeaderDict({'ytdl-test': 'data;'})
self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')}) self.assertEqual(set(headers4.items()), {('Ytdl-Test', 'data;')})
# common mistake: strip whitespace from values
# https://github.com/yt-dlp/yt-dlp/issues/8729
headers5 = HTTPHeaderDict({'ytdl-test': ' data; '})
self.assertEqual(set(headers5.items()), {('Ytdl-Test', 'data;')})
def test_extract_basic_auth(self): def test_extract_basic_auth(self):
assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None) assert extract_basic_auth('http://:foo.bar') == ('http://:foo.bar', None)
assert extract_basic_auth('http://foo.bar') == ('http://foo.bar', None) assert extract_basic_auth('http://foo.bar') == ('http://foo.bar', None)

View File

@ -148,7 +148,7 @@ class TestWebsSocketRequestHandlerConformance:
@pytest.mark.parametrize('handler', ['Websockets'], indirect=True) @pytest.mark.parametrize('handler', ['Websockets'], indirect=True)
def test_ssl_error(self, handler): def test_ssl_error(self, handler):
with handler(verify=False) as rh: with handler(verify=False) as rh:
with pytest.raises(SSLError, match='sslv3 alert handshake failure') as exc_info: with pytest.raises(SSLError, match=r'ssl(?:v3|/tls) alert handshake failure') as exc_info:
validate_and_send(rh, Request(self.bad_wss_host)) validate_and_send(rh, Request(self.bad_wss_host))
assert not issubclass(exc_info.type, CertificateVerifyError) assert not issubclass(exc_info.type, CertificateVerifyError)

View File

@ -276,6 +276,7 @@ from .brilliantpala import (
) )
from .businessinsider import BusinessInsiderIE from .businessinsider import BusinessInsiderIE
from .bundesliga import BundesligaIE from .bundesliga import BundesligaIE
from .bundestag import BundestagIE
from .buzzfeed import BuzzFeedIE from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE from .byutv import BYUtvIE
from .c56 import C56IE from .c56 import C56IE
@ -864,6 +865,7 @@ from .jiosaavn import (
) )
from .jove import JoveIE from .jove import JoveIE
from .joj import JojIE from .joj import JojIE
from .joqrag import JoqrAgIE
from .jstream import JStreamIE from .jstream import JStreamIE
from .jtbc import ( from .jtbc import (
JTBCIE, JTBCIE,
@ -991,6 +993,7 @@ from .lynda import (
LyndaIE, LyndaIE,
LyndaCourseIE LyndaCourseIE
) )
from .maariv import MaarivIE
from .magellantv import MagellanTVIE from .magellantv import MagellanTVIE
from .magentamusik360 import MagentaMusik360IE from .magentamusik360 import MagentaMusik360IE
from .mailru import ( from .mailru import (
@ -1590,6 +1593,7 @@ from .restudy import RestudyIE
from .reuters import ReutersIE from .reuters import ReutersIE
from .reverbnation import ReverbNationIE from .reverbnation import ReverbNationIE
from .rheinmaintv import RheinMainTVIE from .rheinmaintv import RheinMainTVIE
from .rinsefm import RinseFMIE
from .rmcdecouverte import RMCDecouverteIE from .rmcdecouverte import RMCDecouverteIE
from .rockstargames import RockstarGamesIE from .rockstargames import RockstarGamesIE
from .rokfin import ( from .rokfin import (

View File

@ -121,11 +121,21 @@ class AENetworksIE(AENetworksBaseIE):
'info_dict': { 'info_dict': {
'id': '22253814', 'id': '22253814',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Winter is Coming', 'title': 'Winter Is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', 'description': 'md5:a40e370925074260b1c8a633c632c63a',
'timestamp': 1338306241, 'timestamp': 1338306241,
'upload_date': '20120529', 'upload_date': '20120529',
'uploader': 'AENE-NEW', 'uploader': 'AENE-NEW',
'duration': 2592.0,
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:5',
'tags': 'count:14',
'categories': ['Mountain Men'],
'episode_number': 1,
'episode': 'Episode 1',
'season': 'Season 1',
'season_number': 1,
'series': 'Mountain Men',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -143,6 +153,15 @@ class AENetworksIE(AENetworksBaseIE):
'timestamp': 1452634428, 'timestamp': 1452634428,
'upload_date': '20160112', 'upload_date': '20160112',
'uploader': 'AENE-NEW', 'uploader': 'AENE-NEW',
'duration': 1277.695,
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:4',
'tags': 'count:23',
'episode': 'Episode 1',
'episode_number': 1,
'season': 'Season 9',
'season_number': 9,
'series': 'Duck Dynasty',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download

View File

@ -7,8 +7,10 @@ from ..utils import (
ExtractorError, ExtractorError,
OnDemandPagedList, OnDemandPagedList,
clean_html, clean_html,
extract_attributes,
get_element_by_class, get_element_by_class,
get_element_by_id, get_element_by_id,
get_element_html_by_class,
get_elements_html_by_class, get_elements_html_by_class,
int_or_none, int_or_none,
orderedSet, orderedSet,
@ -17,6 +19,7 @@ from ..utils import (
traverse_obj, traverse_obj,
unified_strdate, unified_strdate,
urlencode_postdata, urlencode_postdata,
urljoin,
) )
@ -34,6 +37,25 @@ class BitChuteIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute', 'uploader': 'BitChute',
'upload_date': '20170103', 'upload_date': '20170103',
'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
'channel': 'BitChute',
'channel_url': 'https://www.bitchute.com/channel/bitchute/'
},
}, {
# test case: video with different channel and uploader
'url': 'https://www.bitchute.com/video/Yti_j9A-UZ4/',
'md5': 'f10e6a8e787766235946d0868703f1d0',
'info_dict': {
'id': 'Yti_j9A-UZ4',
'ext': 'mp4',
'title': 'Israel at War | Full Measure',
'description': 'md5:38cf7bc6f42da1a877835539111c69ef',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'sharylattkisson',
'upload_date': '20231106',
'uploader_url': 'https://www.bitchute.com/profile/9K0kUWA9zmd9/',
'channel': 'Full Measure with Sharyl Attkisson',
'channel_url': 'https://www.bitchute.com/channel/sharylattkisson/'
}, },
}, { }, {
# video not downloadable in browser, but we can recover it # video not downloadable in browser, but we can recover it
@ -48,6 +70,9 @@ class BitChuteIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute', 'uploader': 'BitChute',
'upload_date': '20181113', 'upload_date': '20181113',
'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
'channel': 'BitChute',
'channel_url': 'https://www.bitchute.com/channel/bitchute/'
}, },
'params': {'check_formats': None}, 'params': {'check_formats': None},
}, { }, {
@ -99,6 +124,11 @@ class BitChuteIE(InfoExtractor):
reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title
self.raise_geo_restricted(reason) self.raise_geo_restricted(reason)
@staticmethod
def _make_url(html):
path = extract_attributes(get_element_html_by_class('spa', html) or '').get('href')
return urljoin('https://www.bitchute.com', path)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage( webpage = self._download_webpage(
@ -121,12 +151,19 @@ class BitChuteIE(InfoExtractor):
'Video is unavailable. Please make sure this video is playable in the browser ' 'Video is unavailable. Please make sure this video is playable in the browser '
'before reporting this issue.', expected=True, video_id=video_id) 'before reporting this issue.', expected=True, video_id=video_id)
details = get_element_by_class('details', webpage) or ''
uploader_html = get_element_html_by_class('creator', details) or ''
channel_html = get_element_html_by_class('name', details) or ''
return { return {
'id': video_id, 'id': video_id,
'title': self._html_extract_title(webpage) or self._og_search_title(webpage), 'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
'description': self._og_search_description(webpage, default=None), 'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'uploader': clean_html(get_element_by_class('owner', webpage)), 'uploader': clean_html(uploader_html),
'uploader_url': self._make_url(uploader_html),
'channel': clean_html(channel_html),
'channel_url': self._make_url(channel_html),
'upload_date': unified_strdate(self._search_regex( 'upload_date': unified_strdate(self._search_regex(
r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)), r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),
'formats': formats, 'formats': formats,
@ -154,6 +191,9 @@ class BitChuteChannelIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'BitChute', 'uploader': 'BitChute',
'upload_date': '20170103', 'upload_date': '20170103',
'uploader_url': 'https://www.bitchute.com/profile/I5NgtHZn9vPj/',
'channel': 'BitChute',
'channel_url': 'https://www.bitchute.com/channel/bitchute/',
'duration': 16, 'duration': 16,
'view_count': int, 'view_count': int,
}, },
@ -169,7 +209,7 @@ class BitChuteChannelIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'wV9Imujxasw9', 'id': 'wV9Imujxasw9',
'title': 'Bruce MacDonald and "The Light of Darkness"', 'title': 'Bruce MacDonald and "The Light of Darkness"',
'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', 'description': 'md5:747724ef404eebdfc04277714f81863e',
} }
}] }]

View File

@ -0,0 +1,123 @@
import re
from functools import partial
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
bug_reports_message,
clean_html,
format_field,
get_element_text_and_html_by_tag,
int_or_none,
url_or_none,
)
from ..utils.traversal import traverse_obj
class BundestagIE(InfoExtractor):
    """Extractor for German Bundestag media library videos.

    Accepts ``dbtg.tv`` short links (``/cvid/`` and ``/fvid/``) as well as
    ``www.bundestag.de/mediathek`` URLs that carry a ``videoid`` query
    parameter. Formats come from two sources: an HLS playlist built from
    ``_INSTANCE_FORMAT`` and the direct links listed by the share-data JSON.
    """

    _VALID_URL = [
        r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)',
        r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)',
    ]
    _TESTS = [{
        'url': 'https://dbtg.tv/cvid/7605304',
        'info_dict': {
            'id': '7605304',
            'ext': 'mp4',
            'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit',
            'description': 'md5:321a9dc6bdad201264c0045efc371561',
        },
    }, {
        'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek',
        'info_dict': {
            'id': '7602120',
            'ext': 'mp4',
            'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung',
            'description': 'Befragung der Bundesregierung',
        },
    }, {
        'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek',
        'only_matching': True,
    }, {
        'url': 'http://dbtg.tv/fvid/3594346',
        'only_matching': True,
    }]

    # Page that carries the title/description markup for a given video id
    _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay'
    # HLS playlist template; ``{0}`` is replaced by the video id (twice)
    _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8'
    # Share-data JSON endpoint; the video id is appended as ``contentId``
    _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId='
    # Patterns that pull codec/bitrate/etc. out of the share file names
    _SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)'
    _SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)'

    def _bt_extract_share_formats(self, video_id):
        """Yield format dicts for the direct links listed in the share-data JSON.

        Emits a warning and yields nothing when the response's status code
        is not ``1``. Entries whose key starts with ``audio`` become
        audio-only formats; entries whose key starts with ``download``
        become video formats. Everything else is ignored.
        """
        share_data = self._download_json(
            f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON')

        status_code = traverse_obj(share_data, ('status', 'code', {int}))
        if status_code != 1:
            api_message = format_field(
                share_data, [('status', 'message', {str})],
                'Share API response: %s', default='Unknown Share API Error')
            self.report_warning(api_message + bug_reports_message())
            return

        for format_id, format_url in share_data.items():
            # Only string keys mapped to URL-like values describe formats
            if not isinstance(format_id, str) or not url_or_none(format_url):
                continue

            if format_id.startswith('audio'):
                # NOTE(review): when the pattern does not match, ``mobj`` is None and
                # traverse_obj is presumably expected to contribute no keys - confirm
                mobj = re.search(self._SHARE_AUDIO_REGEX, format_url)
                audio_details = traverse_obj(mobj, {
                    'acodec': 'codec',
                    'audio_channels': ('channels', {{'mono': 1, 'stereo': 2}.get}),
                    'abr': ('bitrate', {int_or_none}),
                    'ext': 'ext',
                })
                yield {
                    'format_id': format_id,
                    'url': format_url,
                    'vcodec': 'none',
                    **audio_details,
                }
            elif format_id.startswith('download'):
                mobj = re.search(self._SHARE_VIDEO_REGEX, format_url)
                video_details = traverse_obj(mobj, {
                    'vcodec': 'codec',
                    'tbr': ('bitrate', {int_or_none}),
                    'width': ('width', {int_or_none}),
                    'height': ('height', {int_or_none}),
                    'ext': 'ext',
                })
                yield {
                    'format_id': format_id,
                    'url': format_url,
                    **video_details,
                }

    def _real_extract(self, url):
        """Collect HLS and share formats plus overlay metadata for *url*.

        Raises ExtractorError (expected) when the HLS playlist request
        fails with HTTP 404; any other HLS extraction error is downgraded
        to a warning so the share formats can still be used.
        """
        video_id = self._match_id(url)
        formats = []

        try:
            hls_formats = self._extract_m3u8_formats(
                self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance')
            formats.extend(hls_formats)
        except ExtractorError as e:
            cause = e.cause
            if isinstance(cause, HTTPError) and cause.status == 404:
                raise ExtractorError('Could not find video id', expected=True)
            self.report_warning(f'Error extracting hls formats: {e}', video_id)

        formats.extend(self._bt_extract_share_formats(video_id))
        if not formats:
            self.raise_no_formats('Could not find suitable formats', video_id=video_id)

        # Metadata is optional: the overlay download is non-fatal
        overlay = self._download_webpage(
            self._OVERLAY_URL, video_id,
            query={'videoid': video_id, 'view': 'main'},
            note='Downloading metadata overlay', fatal=False,
        )
        metadata = traverse_obj(overlay, {
            # Title: text of the first <h3>, with any <span>...</span> removed
            'title': (
                {partial(get_element_text_and_html_by_tag, 'h3')}, 0,
                {partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
            # Description: text of the first <p>
            'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
        })

        return {'id': video_id, 'formats': formats, **metadata}

View File

@ -180,6 +180,13 @@ class CBCPlayerIE(InfoExtractor):
'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
'chapters': [], 'chapters': [],
'duration': 494.811, 'duration': 494.811,
'categories': ['AudioMobile/All in a Weekend Montreal'],
'tags': 'count:8',
'location': 'Quebec',
'series': 'All in a Weekend Montreal',
'season': 'Season 2015',
'season_number': 2015,
'media_type': 'Excerpt',
}, },
}, { }, {
'url': 'http://www.cbc.ca/player/play/2164402062', 'url': 'http://www.cbc.ca/player/play/2164402062',
@ -195,25 +202,37 @@ class CBCPlayerIE(InfoExtractor):
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
'chapters': [], 'chapters': [],
'duration': 186.867, 'duration': 186.867,
'series': 'CBC News: Windsor at 6:00',
'categories': ['News/Canada/Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
'creator': 'Allison Johnson',
'media_type': 'Excerpt',
}, },
}, { }, {
# Has subtitles # Has subtitles
# These broadcasts expire after ~1 month, can find new test URL here: # These broadcasts expire after ~1 month, can find new test URL here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
'url': 'http://www.cbc.ca/player/play/2249992771553', 'url': 'http://www.cbc.ca/player/play/2284799043667',
'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd', 'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
'info_dict': { 'info_dict': {
'id': '2249992771553', 'id': '2284799043667',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The National | Womens soccer pay, Florida seawater, Swift quake', 'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
'description': 'md5:adba28011a56cfa47a080ff198dad27a', 'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
'timestamp': 1690596000, 'timestamp': 1700272800,
'duration': 2716.333, 'duration': 2718.833,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg', 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
'uploader': 'CBCC-NEW', 'uploader': 'CBCC-NEW',
'chapters': 'count:5', 'chapters': 'count:5',
'upload_date': '20230729', 'upload_date': '20231118',
'categories': 'count:4',
'series': 'The National - Full Show',
'tags': 'count:1',
'creator': 'News',
'location': 'Canada',
'media_type': 'Full Program',
}, },
}] }]

View File

@ -382,6 +382,7 @@ class InfoExtractor:
'private', 'premium_only', 'subscriber_only', 'needs_auth', 'private', 'premium_only', 'subscriber_only', 'needs_auth',
'unlisted' or 'public'. Use 'InfoExtractor._availability' 'unlisted' or 'public'. Use 'InfoExtractor._availability'
to set it to set it
media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
_old_archive_ids: A list of old archive ids needed for backward compatibility _old_archive_ids: A list of old archive ids needed for backward compatibility
_format_sort_fields: A list of fields to use for sorting formats _format_sort_fields: A list of fields to use for sorting formats
__post_extractor: A function to be called just before the metadata is __post_extractor: A function to be called just before the metadata is

View File

@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor):
'timestamp': 1444107300, 'timestamp': 1444107300,
'age_limit': 14, 'age_limit': 14,
'uploader': 'CWTV', 'uploader': 'CWTV',
'thumbnail': r're:^https?://.*\.jpe?g$',
'chapters': 'count:4',
'episode': 'Episode 20',
'season': 'Season 11',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download

View File

@ -1,15 +1,20 @@
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
try_call, try_call,
unified_timestamp, unified_timestamp,
urlencode_postdata,
) )
class EplusIbIE(InfoExtractor): class EplusIbIE(InfoExtractor):
IE_NAME = 'eplus:inbound' _NETRC_MACHINE = 'eplus'
IE_DESC = 'e+ (イープラス) overseas' IE_NAME = 'eplus'
_VALID_URL = r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)' IE_DESC = 'e+ (イープラス)'
_VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)',
r'https?://live\.eplus\.jp/(?P<id>sample|\d+)']
_TESTS = [{ _TESTS = [{
'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', 'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D',
'info_dict': { 'info_dict': {
@ -29,14 +34,97 @@ class EplusIbIE(InfoExtractor):
'No video formats found!', 'No video formats found!',
'Requested format is not available', 'Requested format is not available',
], ],
}, {
'url': 'https://live.eplus.jp/sample',
'info_dict': {
'id': 'stream1ng20210719-test-005',
'title': 'Online streaming test for DRM',
'live_status': 'was_live',
'release_date': '20210719',
'release_timestamp': 1626703200,
'description': None,
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
'expected_warnings': [
'Could not find the playlist URL. This event may not be accessible',
'No video formats found!',
'Requested format is not available',
'This video is DRM protected',
],
}, {
'url': 'https://live.eplus.jp/2053935',
'info_dict': {
'id': '331320-0001-001',
'title': '丘みどり2020配信LIVE Vol.2 ~秋麗~ 【Streaming+(配信チケット)】',
'live_status': 'was_live',
'release_date': '20200920',
'release_timestamp': 1600596000,
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
'expected_warnings': [
'Could not find the playlist URL. This event may not be accessible',
'No video formats found!',
'Requested format is not available',
],
}] }]
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
def _login(self, username, password, urlh):
if not self._get_cookies('https://live.eplus.jp/').get('ci_session'):
raise ExtractorError('Unable to get ci_session cookie')
cltft_token = urlh.headers.get('X-CLTFT-Token')
if not cltft_token:
raise ExtractorError('Unable to get X-CLTFT-Token')
self._set_cookie('live.eplus.jp', 'X-CLTFT-Token', cltft_token)
login_json = self._download_json(
'https://live.eplus.jp/member/api/v1/FTAuth/idpw', None,
note='Sending pre-login info', errnote='Unable to send pre-login info', headers={
'Content-Type': 'application/json; charset=UTF-8',
'Referer': urlh.url,
'X-Cltft-Token': cltft_token,
'Accept': '*/*',
}, data=json.dumps({
'loginId': username,
'loginPassword': password,
}).encode())
if not login_json.get('isSuccess'):
raise ExtractorError('Login failed: Invalid id or password', expected=True)
self._request_webpage(
urlh.url, None, note='Logging in', errnote='Unable to log in',
data=urlencode_postdata({
'loginId': username,
'loginPassword': password,
'Token.Default': cltft_token,
'op': 'nextPage',
}), headers={'Referer': urlh.url})
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage, urlh = self._download_webpage_handle(
url, video_id, headers={'User-Agent': self._USER_AGENT})
if urlh.url.startswith('https://live.eplus.jp/member/auth'):
username, password = self._get_login_info()
if not username:
self.raise_login_required()
self._login(username, password, urlh)
webpage = self._download_webpage(
url, video_id, headers={'User-Agent': self._USER_AGENT})
data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id) data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id)
if data_json.get('drm_mode') == 'ON':
self.report_drm(video_id)
delivery_status = data_json.get('delivery_status') delivery_status = data_json.get('delivery_status')
archive_mode = data_json.get('archive_mode') archive_mode = data_json.get('archive_mode')
release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400)
@ -64,7 +152,7 @@ class EplusIbIE(InfoExtractor):
formats = [] formats = []
m3u8_playlist_urls = self._search_json( m3u8_playlist_urls = self._search_json(
r'var listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[]) r'var\s+listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[])
if not m3u8_playlist_urls: if not m3u8_playlist_urls:
if live_status == 'is_upcoming': if live_status == 'is_upcoming':
self.raise_no_formats( self.raise_no_formats(

112
yt_dlp/extractor/joqrag.py Normal file
View File

@ -0,0 +1,112 @@
import datetime
import urllib.parse
from .common import InfoExtractor
from ..utils import (
clean_html,
datetime_from_str,
unified_timestamp,
urljoin,
)
class JoqrAgIE(InfoExtractor):
    """Extractor for the A&G+ live stream (uniqueradio.jp / joqr.co.jp).

    Every matching URL resolves to the same single live entry with the
    fixed id ``live``; the URL itself is not used to pick content.
    """

    IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)'
    _VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php',
                  r'https?://(?:www\.)?joqr\.co\.jp/ag/',
                  r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])']
    _TESTS = [{
        'url': 'https://www.uniqueradio.jp/agplayer5/player.php',
        'info_dict': {
            'id': 'live',
            'title': str,
            'channel': '超!A&G+',
            'description': str,
            'live_status': 'is_live',
            'release_timestamp': int,
        },
        'params': {
            'skip_download': True,
            'ignore_no_formats_error': True,
        },
    }, {
        'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php',
        'only_matching': True,
    }, {
        'url': 'https://www.joqr.co.jp/ag/article/103760/',
        'only_matching': True,
    }, {
        'url': 'http://www.joqr.co.jp/qr/agdailyprogram/',
        'only_matching': True,
    }, {
        'url': 'http://www.joqr.co.jp/qr/agregularprogram/',
        'only_matching': True,
    }]

    def _extract_metadata(self, variable, html):
        """Return the cleaned value of the JS string ``var <variable> = '...'``.

        The value is URL-unquoted (``+`` treated as space) and passed
        through clean_html. Returns None when the variable is absent or
        the cleaned result is empty.
        """
        return clean_html(urllib.parse.unquote_plus(self._search_regex(
            rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
            html, 'metadata', group='value', default=''))) or None

    def _extract_start_timestamp(self, video_id, is_live):
        """Return the start timestamp of the first listed programme, or None.

        Looks up today's programme list. When the stream is live but
        today's first programme has not started yet, the running broadcast
        must belong to yesterday's schedule, so that day's list is used.
        """
        def extract_start_time_from(date_str):
            # NOTE(review): the +9h shift presumably converts a UTC datetime from
            # datetime_from_str into the JST calendar date - confirm against the helper
            dt = datetime_from_str(date_str) + datetime.timedelta(hours=9)
            date = dt.strftime('%Y%m%d')
            start_time = self._search_regex(
                r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+\s*(\d{1,2}:\d{1,2})',
                self._download_webpage(
                    f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id,
                    note=f'Downloading program list of {date}', fatal=False,
                    errnote=f'Failed to download program list of {date}') or '',
                'start time', default=None)
            if start_time:
                return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00')
            return None

        start_timestamp = extract_start_time_from('today')
        if not start_timestamp:
            return None

        # Use an explicitly timezone-aware "now": calling .timestamp() on a naive
        # datetime interprets it as local time, which would skew this comparison
        # by the machine's UTC offset.
        now_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp()
        if not is_live or start_timestamp < now_timestamp:
            return start_timestamp
        return extract_start_time_from('yesterday')

    def _real_extract(self, url):
        """Build the info dict for the live stream.

        When the programme name signals a broadcast pause, no formats are
        returned and the entry is marked ``is_upcoming``; otherwise the HLS
        source from the player page is extracted and the entry is ``is_live``.
        """
        video_id = 'live'

        metadata = self._download_webpage(
            'https://www.uniqueradio.jp/aandg', video_id,
            note='Downloading metadata', errnote='Failed to download metadata')
        title = self._extract_metadata('Program_name', metadata)

        if title == '放送休止':
            formats = []
            live_status = 'is_upcoming'
            release_timestamp = self._extract_start_timestamp(video_id, False)
            msg = 'This stream is not currently live'
            if release_timestamp:
                # fromtimestamp() without tz renders the time in the machine's local zone
                msg += (' and will start at '
                        + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
            self.raise_no_formats(msg, expected=True)
        else:
            m3u8_path = self._search_regex(
                r'<source\s[^>]*\bsrc="([^"]+)"',
                self._download_webpage(
                    'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id,
                    note='Downloading player data', errnote='Failed to download player data'),
                'm3u8 url')
            formats = self._extract_m3u8_formats(
                urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id)
            live_status = 'is_live'
            release_timestamp = self._extract_start_timestamp(video_id, True)

        return {
            'id': video_id,
            'title': title,
            'channel': '超!A&G+',
            'description': self._extract_metadata('Program_text', metadata),
            'formats': formats,
            'live_status': live_status,
            'release_timestamp': release_timestamp,
        }

View File

@ -0,0 +1,62 @@
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_resolution,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import traverse_obj
class MaarivIE(InfoExtractor):
    """Extractor for the player.maariv.co.il embedded player.

    The numeric ``media`` query parameter is the video id; media details
    are fetched as JSON from dal.walla.co.il.
    """

    IE_NAME = 'maariv.co.il'
    _VALID_URL = r'https?://player\.maariv\.co\.il/public/player\.html\?(?:[^#]+&)?media=(?P<id>\d+)'
    # Finds the player when it is embedded in an <iframe> on another page
    _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://player.maariv.co.il/public/player.html?player=maariv-desktop&media=3611585',
        'info_dict': {
            'id': '3611585',
            'duration': 75,
            'ext': 'mp4',
            'upload_date': '20231009',
            'title': 'מבצע חרבות ברזל',
            'timestamp': 1696851301,
        },
    }]
    _WEBPAGE_TESTS = [{
        'url': 'https://www.maariv.co.il/news/law/Article-1044008',
        'info_dict': {
            'id': '3611585',
            'duration': 75,
            'ext': 'mp4',
            'upload_date': '20231009',
            'title': 'מבצע חרבות ברזל',
            'timestamp': 1696851301,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict (HLS plus direct HTTP formats) for *url*."""
        video_id = self._match_id(url)
        response = self._download_json(
            f'https://dal.walla.co.il/media/{video_id}?origin=player.maariv.co.il', video_id)
        data = response['data']

        formats = []

        # HLS playlist, if the API advertises one; extraction failure is non-fatal
        hls_url = traverse_obj(data, ('video', 'url', {url_or_none}))
        if hls_url:
            formats.extend(self._extract_m3u8_formats(hls_url, video_id, m3u8_id='hls', fatal=False))

        # Direct HTTP streams; resolution is parsed from the URL text itself
        stream_urls = traverse_obj(data, ('video', 'stream_urls', ..., 'stream_url', {url_or_none}))
        formats.extend({
            'url': stream_url,
            'format_id': 'http',
            **parse_resolution(stream_url),
        } for stream_url in stream_urls)

        metadata = traverse_obj(data, {
            'title': 'title',
            'duration': ('video', 'duration', {int_or_none}),
            'timestamp': ('upload_date', {unified_timestamp}),
        })

        return {
            'id': video_id,
            **metadata,
            'formats': formats,
        }

View File

@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE):
'season_number': 5, 'season_number': 5,
'episode_number': 5, 'episode_number': 5,
'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}], 'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
'categories': ['Informazione'],
}, },
}, { }, {
# DRM # DRM
@ -149,6 +150,7 @@ class MediasetIE(ThePlatformBaseIE):
'season_number': 12, 'season_number': 12,
'episode': 'Episode 8', 'episode': 'Episode 8',
'episode_number': 8, 'episode_number': 8,
'categories': ['Intrattenimento'],
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,

View File

@ -53,6 +53,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'chapters': 'count:1', 'chapters': 'count:1',
'tags': 'count:4', 'tags': 'count:4',
'thumbnail': r're:https?://.+\.jpg', 'thumbnail': r're:https?://.+\.jpg',
'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
'media_type': 'Full Episode',
}, },
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',
@ -131,6 +133,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'tags': 'count:10', 'tags': 'count:10',
'age_limit': 0, 'age_limit': 0,
'thumbnail': r're:https?://.+\.jpg', 'thumbnail': r're:https?://.+\.jpg',
'categories': ['Series/Quantum Leap 2022'],
'media_type': 'Highlight',
}, },
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',

View File

@ -0,0 +1,33 @@
from .common import InfoExtractor
from ..utils import format_field, parse_iso8601
class RinseFMIE(InfoExtractor):
    # Extractor for single Rinse FM episode pages; the URL slug is used as the
    # display id while the real id comes from the page's Next.js data.
    _VALID_URL = r'https?://(?:www\.)?rinse\.fm/episodes/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://rinse.fm/episodes/club-glow-15-12-2023-2000/',
        'md5': '76ee0b719315617df42e15e710f46c7b',
        'info_dict': {
            'id': '1536535',
            'ext': 'mp3',
            'title': 'Club Glow - 15/12/2023 - 20:00',
            'thumbnail': r're:^https://.+\.(?:jpg|JPG)$',
            'release_timestamp': 1702598400,
            'release_date': '20231215',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for a Rinse FM episode page.

        Reads the ``props.pageProps.entry`` object from the page's Next.js
        data; ``id`` and ``fileUrl`` are accessed unconditionally, the
        remaining fields are optional.
        """
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)
        nextjs_data = self._search_nextjs_data(page, slug)
        episode = nextjs_data['props']['pageProps']['entry']

        # Yields None (rather than a bogus URL) when no featured image filename is present
        thumbnail = format_field(
            episode, [('featuredImage', 0, 'filename')],
            'https://rinse.imgix.net/media/%s', default=None)

        return {
            'id': episode['id'],
            'title': episode.get('title'),
            'url': episode['fileUrl'],
            # Marks the single format as audio-only
            'vcodec': 'none',
            'release_timestamp': parse_iso8601(episode.get('episodeDate')),
            'thumbnail': thumbnail,
        }

View File

@ -1,6 +1,7 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
determine_ext,
js_to_json, js_to_json,
traverse_obj, traverse_obj,
update_url_query, update_url_query,
@ -29,6 +30,15 @@ class RudoVideoIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$', 'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
}, },
}, {
'url': 'https://rudo.video/podcast/b42ZUznHX0',
'md5': 'b91c70d832938871367f8ad10c895821',
'info_dict': {
'id': 'b42ZUznHX0',
'title': 'Columna Ruperto Concha',
'ext': 'mp3',
'thumbnail': r're:^(?:https?:)?//.*\.(png|jpg)$',
},
}, { }, {
'url': 'https://rudo.video/vod/bN5AaJ', 'url': 'https://rudo.video/vod/bN5AaJ',
'md5': '01324a329227e2591530ecb4f555c881', 'md5': '01324a329227e2591530ecb4f555c881',
@ -75,13 +85,13 @@ class RudoVideoIE(InfoExtractor):
if 'Streaming is not available in your area' in webpage: if 'Streaming is not available in your area' in webpage:
self.raise_geo_restricted() self.raise_geo_restricted()
m3u8_url = ( media_url = (
self._search_regex( self._search_regex(
r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None) r'var\s+streamURL\s*=\s*[\'"]([^?\'"]+)', webpage, 'stream url', default=None)
# Source URL must be used only if streamURL is unavailable # Source URL must be used only if streamURL is unavailable
or self._search_regex( or self._search_regex(
r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'source url', default=None)) r'<source[^>]+src=[\'"]([^\'"]+)', webpage, 'source url', default=None))
if not m3u8_url: if not media_url:
youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)', youtube_url = self._search_regex(r'file:\s*[\'"]((?:https?:)//(?:www\.)?youtube\.com[^\'"]+)',
webpage, 'youtube url', default=None) webpage, 'youtube url', default=None)
if youtube_url: if youtube_url:
@ -97,7 +107,12 @@ class RudoVideoIE(InfoExtractor):
raise ExtractorError('Invalid access token array') raise ExtractorError('Invalid access token array')
access_token = self._download_json( access_token = self._download_json(
token_url, video_id, note='Downloading access token')['data']['authToken'] token_url, video_id, note='Downloading access token')['data']['authToken']
m3u8_url = update_url_query(m3u8_url, {'auth-token': access_token}) media_url = update_url_query(media_url, {'auth-token': access_token})
if determine_ext(media_url) == 'm3u8':
formats = self._extract_m3u8_formats(media_url, video_id, live=is_live)
else:
formats = [{'url': media_url}]
return { return {
'id': video_id, 'id': video_id,
@ -109,6 +124,6 @@ class RudoVideoIE(InfoExtractor):
'thumbnail': (self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)', 'thumbnail': (self._search_regex(r'var\s+posterIMG\s*=\s*[\'"]([^?\'"]+)',
webpage, 'thumbnail', default=None) webpage, 'thumbnail', default=None)
or self._og_search_thumbnail(webpage)), or self._og_search_thumbnail(webpage)),
'formats': self._extract_m3u8_formats(m3u8_url, video_id, live=is_live), 'formats': formats,
'is_live': is_live, 'is_live': is_live,
} }

View File

@ -114,6 +114,8 @@ class ScrippsNetworksIE(InfoExtractor):
'timestamp': 1475678834, 'timestamp': 1475678834,
'upload_date': '20161005', 'upload_date': '20161005',
'uploader': 'SCNI-SCND', 'uploader': 'SCNI-SCND',
'tags': 'count:10',
'creator': 'Cooking Channel',
'duration': 29.995, 'duration': 29.995,
'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}], 'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}],
'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg', 'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',

View File

@ -104,6 +104,10 @@ class ThePlatformBaseIE(OnceIE):
_add_chapter(chapter.get('startTime'), chapter.get('endTime')) _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
def extract_site_specific_field(field):
# A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
return { return {
'title': info['title'], 'title': info['title'],
'subtitles': subtitles, 'subtitles': subtitles,
@ -113,6 +117,14 @@ class ThePlatformBaseIE(OnceIE):
'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'), 'uploader': info.get('billingCode'),
'chapters': chapters, 'chapters': chapters,
'creator': traverse_obj(info, ('author', {str})) or None,
'categories': traverse_obj(info, (
'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
'location': extract_site_specific_field('region'),
'series': extract_site_specific_field('show'),
'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
} }
def _extract_theplatform_metadata(self, path, video_id): def _extract_theplatform_metadata(self, path, video_id):

View File

@ -479,9 +479,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 18, 'age_limit': 18,
'_old_archive_ids': ['twitter 643211948184596480'],
}, },
}, { }, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
@ -515,6 +515,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int, 'like_count': int,
'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'tags': ['TV', 'StarWars', 'TheForceAwakens'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 665052190608723968'],
}, },
}, { }, {
'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
@ -558,9 +559,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': ['Damndaniel'], 'tags': ['Damndaniel'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 700207533655363584'],
}, },
}, { }, {
'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',
@ -599,9 +600,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 719944021058060289'],
}, },
}, { }, {
'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384',
@ -616,6 +617,7 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
}, },
'add_ie': ['Periscope'], 'add_ie': ['Periscope'],
'skip': 'Broadcast not found',
}, { }, {
# has mp4 formats via mobile API # has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067', 'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
@ -635,9 +637,9 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'tags': [], 'tags': [],
'repost_count': int, 'repost_count': int,
'view_count': int,
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'_old_archive_ids': ['twitter 852138619213144067'],
}, },
}, { }, {
'url': 'https://twitter.com/i/web/status/910031516746514432', 'url': 'https://twitter.com/i/web/status/910031516746514432',
@ -657,9 +659,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': ['Maria'], 'tags': ['Maria'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 910031516746514432'],
}, },
'params': { 'params': {
'skip_download': True, # requires ffmpeg 'skip_download': True, # requires ffmpeg
@ -683,9 +685,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1001551623938805763'],
}, },
'params': { 'params': {
'skip_download': True, # requires ffmpeg 'skip_download': True, # requires ffmpeg
@ -749,6 +751,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int, 'like_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1349794411333394432'],
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -771,18 +774,18 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1577855540407197696'],
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': { 'info_dict': {
'id': '1577719286659006464', 'id': '1577719286659006464',
'title': 'Ultima📛| New Era - Test', 'title': 'Ultima - Test',
'description': 'Test https://t.co/Y3KEZD7Dad', 'description': 'Test https://t.co/Y3KEZD7Dad',
'uploader': 'Ultima📛| New Era', 'uploader': 'Ultima',
'uploader_id': 'UltimaShadowX', 'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005', 'upload_date': '20221005',
@ -813,9 +816,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': ['HurricaneIan'], 'tags': ['HurricaneIan'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1575560063510810624'],
}, },
}, { }, {
# Adult content, fails if not logged in # Adult content, fails if not logged in
@ -951,10 +954,10 @@ class TwitterIE(TwitterBaseIE):
'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'uploader_url': 'https://twitter.com/CTVJLaidlaw',
'display_id': '1600649710662213632', 'display_id': '1600649710662213632',
'like_count': int, 'like_count': int,
'view_count': int,
'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
'upload_date': '20221208', 'upload_date': '20221208',
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1600649710662213632'],
}, },
'params': {'noplaylist': True}, 'params': {'noplaylist': True},
}, { }, {
@ -979,7 +982,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'view_count': int, '_old_archive_ids': ['twitter 1621117700482416640'],
}, },
}, { }, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
@ -995,13 +998,13 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int, 'repost_count': int,
'duration': 9.531, 'duration': 9.531,
'comment_count': int, 'comment_count': int,
'view_count': int,
'upload_date': '20221203', 'upload_date': '20221203',
'age_limit': 0, 'age_limit': 0,
'timestamp': 1670092210.0, 'timestamp': 1670092210.0,
'tags': [], 'tags': [],
'uploader': '\u06ea', 'uploader': '\u06ea',
'description': '\U0001F48B https://t.co/bTj9Qz7vQP', 'description': '\U0001F48B https://t.co/bTj9Qz7vQP',
'_old_archive_ids': ['twitter 1599108751385972737'],
}, },
'params': {'noplaylist': True}, 'params': {'noplaylist': True},
}, { }, {
@ -1012,7 +1015,6 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'uploader_url': 'https://twitter.com/MunTheShinobi', 'uploader_url': 'https://twitter.com/MunTheShinobi',
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
'view_count': int,
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0, 'age_limit': 0,
'uploader': 'Mün', 'uploader': 'Mün',
@ -1025,6 +1027,7 @@ class TwitterIE(TwitterBaseIE):
'uploader_id': 'MunTheShinobi', 'uploader_id': 'MunTheShinobi',
'duration': 139.987, 'duration': 139.987,
'timestamp': 1670306984.0, 'timestamp': 1670306984.0,
'_old_archive_ids': ['twitter 1600009574919962625'],
}, },
}, { }, {
# retweeted_status (private) # retweeted_status (private)
@ -1068,8 +1071,8 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'view_count': int,
'comment_count': int, 'comment_count': int,
'_old_archive_ids': ['twitter 1695424220702888009'],
}, },
}, { }, {
# retweeted_status w/ legacy API # retweeted_status w/ legacy API
@ -1091,18 +1094,24 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'_old_archive_ids': ['twitter 1695424220702888009'],
}, },
'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
}, { }, {
# Broadcast embedded in tweet # Broadcast embedded in tweet
'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', 'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384',
'info_dict': { 'info_dict': {
'id': '1yNGaNLjEblJj', 'id': '1rmxPMjLzAXKN',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', 'title': 'WAVE Weather Now - Saturday 12/2/23 Update',
'uploader': 'Jessica Dobson', 'uploader': 'Jessica Dobson',
'uploader_id': '1DZEoDwDovRQa', 'uploader_id': 'JessicaDobsonWX',
'thumbnail': r're:^https?://.*\.jpg', 'uploader_url': 'https://twitter.com/JessicaDobsonWX',
'timestamp': 1701566398,
'upload_date': '20231203',
'live_status': 'was_live',
'thumbnail': r're:https://[^/]+pscp\.tv/.+\.jpg',
'concurrent_view_count': int,
'view_count': int, 'view_count': int,
}, },
'add_ie': ['TwitterBroadcast'], 'add_ie': ['TwitterBroadcast'],
@ -1125,6 +1134,30 @@ class TwitterIE(TwitterBaseIE):
}, },
'params': {'extractor_args': {'twitter': {'api': ['syndication']}}}, 'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
'expected_warnings': ['Not all metadata'], 'expected_warnings': ['Not all metadata'],
}, {
# "stale tweet" with typename "TweetWithVisibilityResults"
'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154',
'md5': '62b1e11cdc2cdd0e527f83adb081f536',
'info_dict': {
'id': '1724883339285544960',
'ext': 'mp4',
'title': 'md5:cc56716f9ed0b368de2ba54c478e493c',
'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164',
'display_id': '1724884212803834154',
'uploader': 'Robert F. Kennedy Jr',
'uploader_id': 'RobertKennedyJr',
'uploader_url': 'https://twitter.com/RobertKennedyJr',
'upload_date': '20231115',
'timestamp': 1700079417.0,
'duration': 341.048,
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'tags': ['Kennedy24'],
'repost_count': int,
'like_count': int,
'comment_count': int,
'age_limit': 0,
'_old_archive_ids': ['twitter 1724884212803834154'],
},
}, { }, {
# onion route # onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@ -1179,19 +1212,23 @@ class TwitterIE(TwitterBaseIE):
), default={}, get_all=False) if self.is_logged_in else traverse_obj( ), default={}, get_all=False) if self.is_logged_in else traverse_obj(
data, ('tweetResult', 'result', {dict}), default={}) data, ('tweetResult', 'result', {dict}), default={})
if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): typename = result.get('__typename')
self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None):
self.report_warning(f'Unknown typename: {typename}', twid, only_once=True)
if 'tombstone' in result: if 'tombstone' in result:
cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more')
raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)
elif result.get('__typename') == 'TweetUnavailable': elif typename == 'TweetUnavailable':
reason = result.get('reason') reason = result.get('reason')
if reason == 'NsfwLoggedOut': if reason == 'NsfwLoggedOut':
self.raise_login_required('NSFW tweet requires authentication') self.raise_login_required('NSFW tweet requires authentication')
elif reason == 'Protected': elif reason == 'Protected':
self.raise_login_required('You are not authorized to view this protected tweet') self.raise_login_required('You are not authorized to view this protected tweet')
raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True)
# Result for "stale tweet" needs additional transformation
elif typename == 'TweetWithVisibilityResults':
result = traverse_obj(result, ('tweet', {dict})) or {}
status = result.get('legacy', {}) status = result.get('legacy', {})
status.update(traverse_obj(result, { status.update(traverse_obj(result, {
@ -1377,7 +1414,7 @@ class TwitterIE(TwitterBaseIE):
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available
'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# The codec of http formats are unknown # The codec of http formats are unknown
'_format_sort_fields': ('res', 'br', 'size', 'proto'), '_format_sort_fields': ('res', 'br', 'size', 'proto'),

View File

@ -4480,14 +4480,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if mobj: if mobj:
info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
break break
sbr_tooltip = try_get(
vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) info['like_count'] = traverse_obj(vpir, (
if sbr_tooltip: 'videoActions', 'menuRenderer', 'topLevelButtons', ...,
like_count, dislike_count = sbr_tooltip.split(' / ') 'segmentedLikeDislikeButtonViewModel', 'likeButtonViewModel', 'likeButtonViewModel',
info.update({ 'toggleButtonViewModel', 'toggleButtonViewModel', 'defaultButtonViewModel',
'like_count': str_to_int(like_count), 'buttonViewModel', 'accessibilityText', {parse_count}), get_all=False)
'dislike_count': str_to_int(dislike_count),
})
vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer')) vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer'))
if vcr: if vcr:
vc = self._get_count(vcr, 'viewCount') vc = self._get_count(vcr, 'viewCount')

View File

@ -67,7 +67,7 @@ class HTTPHeaderDict(collections.UserDict, dict):
def __setitem__(self, key, value): def __setitem__(self, key, value):
if isinstance(value, bytes): if isinstance(value, bytes):
value = value.decode('latin-1') value = value.decode('latin-1')
super().__setitem__(key.title(), str(value)) super().__setitem__(key.title(), str(value).strip())
def __getitem__(self, key): def __getitem__(self, key):
return super().__getitem__(key.title()) return super().__getitem__(key.title())

View File

@ -286,8 +286,8 @@ class CueBlock(Block):
m1 = parser.consume(_REGEX_TS) m1 = parser.consume(_REGEX_TS)
if not m1: if not m1:
return None return None
parser.consume(_REGEX_OPTIONAL_WHITESPACE)
m2 = parser.consume(cls._REGEX_SETTINGS) m2 = parser.consume(cls._REGEX_SETTINGS)
parser.consume(_REGEX_OPTIONAL_WHITESPACE)
if not parser.consume(_REGEX_NL): if not parser.consume(_REGEX_NL):
return None return None