Compare commits

...

2 Commits

Author SHA1 Message Date
Mozi
db8b4edc7d
[ie/JoqrAg] Add extractor (#8384)
Authored by: pzhlkj6612
2023-12-19 14:21:47 +00:00
bashonly
1c54a98e19
[ie/twitter] Extract stale tweets (#8724)
Closes #8691
Authored by: bashonly
2023-12-19 13:24:55 +00:00
3 changed files with 174 additions and 24 deletions

View File

@ -865,6 +865,7 @@ from .jiosaavn import (
) )
from .jove import JoveIE from .jove import JoveIE
from .joj import JojIE from .joj import JojIE
from .joqrag import JoqrAgIE
from .jstream import JStreamIE from .jstream import JStreamIE
from .jtbc import ( from .jtbc import (
JTBCIE, JTBCIE,

112
yt_dlp/extractor/joqrag.py Normal file
View File

@ -0,0 +1,112 @@
import datetime
import urllib.parse
from .common import InfoExtractor
from ..utils import (
clean_html,
datetime_from_str,
unified_timestamp,
urljoin,
)
class JoqrAgIE(InfoExtractor):
IE_DESC = '超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR)'
_VALID_URL = [r'https?://www\.uniqueradio\.jp/agplayer5/(?:player|inc-player-hls)\.php',
r'https?://(?:www\.)?joqr\.co\.jp/ag/',
r'https?://(?:www\.)?joqr\.co\.jp/qr/ag(?:daily|regular)program/?(?:$|[#?])']
_TESTS = [{
'url': 'https://www.uniqueradio.jp/agplayer5/player.php',
'info_dict': {
'id': 'live',
'title': str,
'channel': '超!A&G+',
'description': str,
'live_status': 'is_live',
'release_timestamp': int,
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}, {
'url': 'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php',
'only_matching': True,
}, {
'url': 'https://www.joqr.co.jp/ag/article/103760/',
'only_matching': True,
}, {
'url': 'http://www.joqr.co.jp/qr/agdailyprogram/',
'only_matching': True,
}, {
'url': 'http://www.joqr.co.jp/qr/agregularprogram/',
'only_matching': True,
}]
def _extract_metadata(self, variable, html):
return clean_html(urllib.parse.unquote_plus(self._search_regex(
rf'var\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
html, 'metadata', group='value', default=''))) or None
def _extract_start_timestamp(self, video_id, is_live):
def extract_start_time_from(date_str):
dt = datetime_from_str(date_str) + datetime.timedelta(hours=9)
date = dt.strftime('%Y%m%d')
start_time = self._search_regex(
r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+\s*(\d{1,2}:\d{1,2})',
self._download_webpage(
f'https://www.joqr.co.jp/qr/agdailyprogram/?date={date}', video_id,
note=f'Downloading program list of {date}', fatal=False,
errnote=f'Failed to download program list of {date}') or '',
'start time', default=None)
if start_time:
return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00')
return None
start_timestamp = extract_start_time_from('today')
if not start_timestamp:
return None
if not is_live or start_timestamp < datetime_from_str('now').timestamp():
return start_timestamp
else:
return extract_start_time_from('yesterday')
def _real_extract(self, url):
video_id = 'live'
metadata = self._download_webpage(
'https://www.uniqueradio.jp/aandg', video_id,
note='Downloading metadata', errnote='Failed to download metadata')
title = self._extract_metadata('Program_name', metadata)
if title == '放送休止':
formats = []
live_status = 'is_upcoming'
release_timestamp = self._extract_start_timestamp(video_id, False)
msg = 'This stream is not currently live'
if release_timestamp:
msg += (' and will start at '
+ datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
self.raise_no_formats(msg, expected=True)
else:
m3u8_path = self._search_regex(
r'<source\s[^>]*\bsrc="([^"]+)"',
self._download_webpage(
'https://www.uniqueradio.jp/agplayer5/inc-player-hls.php', video_id,
note='Downloading player data', errnote='Failed to download player data'),
'm3u8 url')
formats = self._extract_m3u8_formats(
urljoin('https://www.uniqueradio.jp/', m3u8_path), video_id)
live_status = 'is_live'
release_timestamp = self._extract_start_timestamp(video_id, True)
return {
'id': video_id,
'title': title,
'channel': '超!A&G+',
'description': self._extract_metadata('Program_text', metadata),
'formats': formats,
'live_status': live_status,
'release_timestamp': release_timestamp,
}

View File

@ -479,9 +479,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 18, 'age_limit': 18,
'_old_archive_ids': ['twitter 643211948184596480'],
}, },
}, { }, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
@ -515,6 +515,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int, 'like_count': int,
'tags': ['TV', 'StarWars', 'TheForceAwakens'], 'tags': ['TV', 'StarWars', 'TheForceAwakens'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 665052190608723968'],
}, },
}, { }, {
'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
@ -558,9 +559,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': ['Damndaniel'], 'tags': ['Damndaniel'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 700207533655363584'],
}, },
}, { }, {
'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',
@ -599,9 +600,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 719944021058060289'],
}, },
}, { }, {
'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384',
@ -616,6 +617,7 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
}, },
'add_ie': ['Periscope'], 'add_ie': ['Periscope'],
'skip': 'Broadcast not found',
}, { }, {
# has mp4 formats via mobile API # has mp4 formats via mobile API
'url': 'https://twitter.com/news_al3alm/status/852138619213144067', 'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
@ -635,9 +637,9 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:^https?://.*\.jpg', 'thumbnail': r're:^https?://.*\.jpg',
'tags': [], 'tags': [],
'repost_count': int, 'repost_count': int,
'view_count': int,
'like_count': int, 'like_count': int,
'comment_count': int, 'comment_count': int,
'_old_archive_ids': ['twitter 852138619213144067'],
}, },
}, { }, {
'url': 'https://twitter.com/i/web/status/910031516746514432', 'url': 'https://twitter.com/i/web/status/910031516746514432',
@ -657,9 +659,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': ['Maria'], 'tags': ['Maria'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 910031516746514432'],
}, },
'params': { 'params': {
'skip_download': True, # requires ffmpeg 'skip_download': True, # requires ffmpeg
@ -683,9 +685,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1001551623938805763'],
}, },
'params': { 'params': {
'skip_download': True, # requires ffmpeg 'skip_download': True, # requires ffmpeg
@ -749,6 +751,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int, 'like_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1349794411333394432'],
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
@ -771,18 +774,18 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': [], 'tags': [],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1577855540407197696'],
}, },
'params': {'skip_download': True}, 'params': {'skip_download': True},
}, { }, {
'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
'info_dict': { 'info_dict': {
'id': '1577719286659006464', 'id': '1577719286659006464',
'title': 'Ultima📛| New Era - Test', 'title': 'Ultima - Test',
'description': 'Test https://t.co/Y3KEZD7Dad', 'description': 'Test https://t.co/Y3KEZD7Dad',
'uploader': 'Ultima📛| New Era', 'uploader': 'Ultima',
'uploader_id': 'UltimaShadowX', 'uploader_id': 'UltimaShadowX',
'uploader_url': 'https://twitter.com/UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX',
'upload_date': '20221005', 'upload_date': '20221005',
@ -813,9 +816,9 @@ class TwitterIE(TwitterBaseIE):
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int, 'like_count': int,
'view_count': int,
'tags': ['HurricaneIan'], 'tags': ['HurricaneIan'],
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1575560063510810624'],
}, },
}, { }, {
# Adult content, fails if not logged in # Adult content, fails if not logged in
@ -951,10 +954,10 @@ class TwitterIE(TwitterBaseIE):
'uploader_url': 'https://twitter.com/CTVJLaidlaw', 'uploader_url': 'https://twitter.com/CTVJLaidlaw',
'display_id': '1600649710662213632', 'display_id': '1600649710662213632',
'like_count': int, 'like_count': int,
'view_count': int,
'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
'upload_date': '20221208', 'upload_date': '20221208',
'age_limit': 0, 'age_limit': 0,
'_old_archive_ids': ['twitter 1600649710662213632'],
}, },
'params': {'noplaylist': True}, 'params': {'noplaylist': True},
}, { }, {
@ -979,7 +982,7 @@ class TwitterIE(TwitterBaseIE):
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'comment_count': int, 'comment_count': int,
'view_count': int, '_old_archive_ids': ['twitter 1621117700482416640'],
}, },
}, { }, {
'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', 'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2',
@ -995,13 +998,13 @@ class TwitterIE(TwitterBaseIE):
'repost_count': int, 'repost_count': int,
'duration': 9.531, 'duration': 9.531,
'comment_count': int, 'comment_count': int,
'view_count': int,
'upload_date': '20221203', 'upload_date': '20221203',
'age_limit': 0, 'age_limit': 0,
'timestamp': 1670092210.0, 'timestamp': 1670092210.0,
'tags': [], 'tags': [],
'uploader': '\u06ea', 'uploader': '\u06ea',
'description': '\U0001F48B https://t.co/bTj9Qz7vQP', 'description': '\U0001F48B https://t.co/bTj9Qz7vQP',
'_old_archive_ids': ['twitter 1599108751385972737'],
}, },
'params': {'noplaylist': True}, 'params': {'noplaylist': True},
}, { }, {
@ -1012,7 +1015,6 @@ class TwitterIE(TwitterBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'uploader_url': 'https://twitter.com/MunTheShinobi', 'uploader_url': 'https://twitter.com/MunTheShinobi',
'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', 'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml',
'view_count': int,
'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',
'age_limit': 0, 'age_limit': 0,
'uploader': 'Mün', 'uploader': 'Mün',
@ -1025,6 +1027,7 @@ class TwitterIE(TwitterBaseIE):
'uploader_id': 'MunTheShinobi', 'uploader_id': 'MunTheShinobi',
'duration': 139.987, 'duration': 139.987,
'timestamp': 1670306984.0, 'timestamp': 1670306984.0,
'_old_archive_ids': ['twitter 1600009574919962625'],
}, },
}, { }, {
# retweeted_status (private) # retweeted_status (private)
@ -1068,8 +1071,8 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'view_count': int,
'comment_count': int, 'comment_count': int,
'_old_archive_ids': ['twitter 1695424220702888009'],
}, },
}, { }, {
# retweeted_status w/ legacy API # retweeted_status w/ legacy API
@ -1091,18 +1094,24 @@ class TwitterIE(TwitterBaseIE):
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'like_count': int, 'like_count': int,
'repost_count': int, 'repost_count': int,
'_old_archive_ids': ['twitter 1695424220702888009'],
}, },
'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}},
}, { }, {
# Broadcast embedded in tweet # Broadcast embedded in tweet
'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', 'url': 'https://twitter.com/JessicaDobsonWX/status/1731121063248175384',
'info_dict': { 'info_dict': {
'id': '1yNGaNLjEblJj', 'id': '1rmxPMjLzAXKN',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', 'title': 'WAVE Weather Now - Saturday 12/2/23 Update',
'uploader': 'Jessica Dobson', 'uploader': 'Jessica Dobson',
'uploader_id': '1DZEoDwDovRQa', 'uploader_id': 'JessicaDobsonWX',
'thumbnail': r're:^https?://.*\.jpg', 'uploader_url': 'https://twitter.com/JessicaDobsonWX',
'timestamp': 1701566398,
'upload_date': '20231203',
'live_status': 'was_live',
'thumbnail': r're:https://[^/]+pscp\.tv/.+\.jpg',
'concurrent_view_count': int,
'view_count': int, 'view_count': int,
}, },
'add_ie': ['TwitterBroadcast'], 'add_ie': ['TwitterBroadcast'],
@ -1125,6 +1134,30 @@ class TwitterIE(TwitterBaseIE):
}, },
'params': {'extractor_args': {'twitter': {'api': ['syndication']}}}, 'params': {'extractor_args': {'twitter': {'api': ['syndication']}}},
'expected_warnings': ['Not all metadata'], 'expected_warnings': ['Not all metadata'],
}, {
# "stale tweet" with typename "TweetWithVisibilityResults"
'url': 'https://twitter.com/RobertKennedyJr/status/1724884212803834154',
'md5': '62b1e11cdc2cdd0e527f83adb081f536',
'info_dict': {
'id': '1724883339285544960',
'ext': 'mp4',
'title': 'md5:cc56716f9ed0b368de2ba54c478e493c',
'description': 'md5:9dc14f5b0f1311fc7caf591ae253a164',
'display_id': '1724884212803834154',
'uploader': 'Robert F. Kennedy Jr',
'uploader_id': 'RobertKennedyJr',
'uploader_url': 'https://twitter.com/RobertKennedyJr',
'upload_date': '20231115',
'timestamp': 1700079417.0,
'duration': 341.048,
'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
'tags': ['Kennedy24'],
'repost_count': int,
'like_count': int,
'comment_count': int,
'age_limit': 0,
'_old_archive_ids': ['twitter 1724884212803834154'],
},
}, { }, {
# onion route # onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
@ -1179,19 +1212,23 @@ class TwitterIE(TwitterBaseIE):
), default={}, get_all=False) if self.is_logged_in else traverse_obj( ), default={}, get_all=False) if self.is_logged_in else traverse_obj(
data, ('tweetResult', 'result', {dict}), default={}) data, ('tweetResult', 'result', {dict}), default={})
if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): typename = result.get('__typename')
self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) if typename not in ('Tweet', 'TweetWithVisibilityResults', 'TweetTombstone', 'TweetUnavailable', None):
self.report_warning(f'Unknown typename: {typename}', twid, only_once=True)
if 'tombstone' in result: if 'tombstone' in result:
cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more')
raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)
elif result.get('__typename') == 'TweetUnavailable': elif typename == 'TweetUnavailable':
reason = result.get('reason') reason = result.get('reason')
if reason == 'NsfwLoggedOut': if reason == 'NsfwLoggedOut':
self.raise_login_required('NSFW tweet requires authentication') self.raise_login_required('NSFW tweet requires authentication')
elif reason == 'Protected': elif reason == 'Protected':
self.raise_login_required('You are not authorized to view this protected tweet') self.raise_login_required('You are not authorized to view this protected tweet')
raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True)
# Result for "stale tweet" needs additional transformation
elif typename == 'TweetWithVisibilityResults':
result = traverse_obj(result, ('tweet', {dict})) or {}
status = result.get('legacy', {}) status = result.get('legacy', {})
status.update(traverse_obj(result, { status.update(traverse_obj(result, {
@ -1377,7 +1414,7 @@ class TwitterIE(TwitterBaseIE):
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available
'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# The codec of http formats are unknown # The codec of http formats are unknown
'_format_sort_fields': ('res', 'br', 'size', 'proto'), '_format_sort_fields': ('res', 'br', 'size', 'proto'),