Compare commits

..

No commits in common. "91ca560381165969194fb04e764782d720816f74" and "0eb370d3cd1071bdef32e82cfb05ee87e6f6fe2a" have entirely different histories.

7 changed files with 72 additions and 127 deletions

View File

@ -140,8 +140,6 @@ class TestFormatSelection(unittest.TestCase):
test('example-with-dashes', 'example-with-dashes') test('example-with-dashes', 'example-with-dashes')
test('all', '2', '47', '45', 'example-with-dashes', '35') test('all', '2', '47', '45', 'example-with-dashes', '35')
test('mergeall', '2+47+45+example-with-dashes+35', multi=True) test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
# See: https://github.com/yt-dlp/yt-dlp/pulls/8797
test('7_a/worst', '35')
def test_format_selection_audio(self): def test_format_selection_audio(self):
formats = [ formats = [

View File

@ -2465,16 +2465,9 @@ class YoutubeDL:
return selector_function(ctx_copy) return selector_function(ctx_copy)
return final_selector return final_selector
# HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid stream = io.BytesIO(format_spec.encode())
# Prefix numbers with random letters to avoid it being classified as a number
# See: https://github.com/yt-dlp/yt-dlp/pulls/8797
# TODO: Implement parser not reliant on tokenize.tokenize
prefix = ''.join(random.choices(string.ascii_letters, k=32))
stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
try: try:
tokens = list(_remove_unused_ops( tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
token._replace(string=token.string.replace(prefix, ''))
for token in tokenize.tokenize(stream.readline)))
except tokenize.TokenError: except tokenize.TokenError:
raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

View File

@ -292,7 +292,7 @@ class ARDIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
# available till 7.12.2023 # available till 7.12.2023
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html', 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
'md5': '94812e6438488fb923c361a44469614b', 'md5': 'a438f671e87a7eba04000336a119ccc4',
'info_dict': { 'info_dict': {
'id': 'maischberger-video-424', 'id': 'maischberger-video-424',
'display_id': 'maischberger-video-424', 'display_id': 'maischberger-video-424',
@ -403,25 +403,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'''(?x)https:// _VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/ (?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)? (?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|serie|sammlung))/ (?:player|live|video|(?P<playlist>sendung|sammlung))/
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)? (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
_TESTS = [{ _TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0', 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4', 'md5': '3fd5fead7a370a819341129c8d713136',
'info_dict': { 'info_dict': {
'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen', 'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
'id': '12939099', 'id': '12172961',
'title': 'Liebe auf vier Pfoten', 'title': 'Wolfsland - Die traurigen Schwestern',
'description': r're:^Claudia Schmitt, Anwältin in Salzburg', 'description': r're:^Als der Polizeiobermeister Raaben',
'duration': 5222, 'duration': 5241,
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b', 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
'timestamp': 1701343800, 'timestamp': 1670710500,
'upload_date': '20231130', 'upload_date': '20221210',
'ext': 'mp4', 'ext': 'mp4',
'episode': 'Liebe auf vier Pfoten', 'age_limit': 12,
'episode': 'Wolfsland - Die traurigen Schwestern',
'series': 'Filme im MDR' 'series': 'Filme im MDR'
}, },
}, { }, {
@ -453,7 +454,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'duration': 915, 'duration': 915,
'episode': 'tagesschau, 20:00 Uhr', 'episode': 'tagesschau, 20:00 Uhr',
'series': 'tagesschau', 'series': 'tagesschau',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678', 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
}, },
}, { }, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -474,10 +475,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sendung' # playlist of type 'sendung'
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/', 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
'only_matching': True, 'only_matching': True,
}, {
# playlist of type 'serie'
'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
'only_matching': True,
}, { }, {
# playlist of type 'sammlung' # playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@ -490,11 +487,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number): def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
""" Query the ARD server for playlist information """ Query the ARD server for playlist information
and returns the data in "raw" format """ and returns the data in "raw" format """
assert mode in ('sendung', 'serie', 'sammlung') if mode == 'sendung':
if mode in ('sendung', 'serie'):
graphQL = json.dumps({ graphQL = json.dumps({
'query': '''{ 'query': '''{
showPage( showPage(
@ -511,7 +507,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
links { target { id href title } } links { target { id href title } }
type type
} }
}}''' % (client, playlist_id, page_number), }}''' % (client, playlist_id, pageNumber),
}).encode() }).encode()
else: # mode == 'sammlung' else: # mode == 'sammlung'
graphQL = json.dumps({ graphQL = json.dumps({
@ -532,7 +528,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
type type
} }
} }
}}''' % (client, playlist_id, page_number), }}''' % (client, playlist_id, pageNumber),
}).encode() }).encode()
# Ressources for ARD graphQL debugging: # Ressources for ARD graphQL debugging:
# https://api-test.ardmediathek.de/public-gateway # https://api-test.ardmediathek.de/public-gateway
@ -542,7 +538,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
data=graphQL, data=graphQL,
headers={'Content-Type': 'application/json'})['data'] headers={'Content-Type': 'application/json'})['data']
# align the structure of the returned data: # align the structure of the returned data:
if mode in ('sendung', 'serie'): if mode == 'sendung':
show_page = show_page['showPage'] show_page = show_page['showPage']
else: # mode == 'sammlung' else: # mode == 'sammlung'
show_page = show_page['morePage']['widget'] show_page = show_page['morePage']['widget']
@ -550,12 +546,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode): def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
""" Collects all playlist entries and returns them as info dict. """ Collects all playlist entries and returns them as info dict.
Supports playlists of mode 'sendung', 'serie', and 'sammlung', Supports playlists of mode 'sendung' and 'sammlung', and also nested
as well as nested playlists. """ playlists. """
entries = [] entries = []
pageNumber = 0 pageNumber = 0
while True: # iterate by pageNumber while True: # iterate by pageNumber
show_page = self._ARD_load_playlist_snippet( show_page = self._ARD_load_playlist_snipped(
playlist_id, display_id, client, mode, pageNumber) playlist_id, display_id, client, mode, pageNumber)
for teaser in show_page['teasers']: # process playlist items for teaser in show_page['teasers']: # process playlist items
if '/compilation/' in teaser['links']['target']['href']: if '/compilation/' in teaser['links']['target']['href']:

View File

@ -52,7 +52,7 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=| )\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?| [^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/| [^/]+/posts/|
groups/[^/]+/(?:permalink|posts)/| groups/[^/]+/permalink/|
watchparty/ watchparty/
)| )|
facebook: facebook:
@ -232,21 +232,6 @@ class FacebookIE(InfoExtractor):
'uploader_id': '100013949973717', 'uploader_id': '100013949973717',
}, },
'skip': 'Requires logging in', 'skip': 'Requires logging in',
}, {
# data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
'info_dict': {
'id': '1569199726448814',
'ext': 'mp4',
'title': 'Pence MUST GO!',
'description': 'Vickie Gentry shared a memory.',
'timestamp': 1511548260,
'upload_date': '20171124',
'uploader': 'Vickie Gentry',
'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
'thumbnail': r're:^https?://.*',
'duration': 148.435,
},
}, { }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True, 'only_matching': True,
@ -627,11 +612,9 @@ class FacebookIE(InfoExtractor):
nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
attachments = traverse_obj(nodes, ( attachments = traverse_obj(nodes, (
..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')), ..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
'attachment', {dict}))
for attachment in attachments: for attachment in attachments:
ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}), ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
('target', 'attachments', ..., 'styles', 'attachment', {dict}))
for n in ns: for n in ns:
parse_attachment(n) parse_attachment(n)
parse_attachment(attachment) parse_attachment(attachment)
@ -654,7 +637,7 @@ class FacebookIE(InfoExtractor):
if len(entries) > 1: if len(entries) > 1:
return self.playlist_result(entries, video_id) return self.playlist_result(entries, video_id)
video_info = entries[0] if entries else {'id': video_id} video_info = entries[0]
webpage_info = extract_metadata(webpage) webpage_info = extract_metadata(webpage)
# honor precise duration in video info # honor precise duration in video info
if video_info.get('duration'): if video_info.get('duration'):

View File

@ -10,7 +10,6 @@ from ..utils import (
ExtractorError, ExtractorError,
decode_base_n, decode_base_n,
encode_base_n, encode_base_n,
filter_dict,
float_or_none, float_or_none,
format_field, format_field,
get_element_by_attribute, get_element_by_attribute,
@ -704,31 +703,28 @@ class InstagramStoryIE(InstagramBaseIE):
user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
if not user_info: if not user_info:
self.raise_login_required('This content is unreachable') self.raise_login_required('This content is unreachable')
user_id = user_info.get('id')
user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str)
story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
if not story_info_url: # user id is only mandatory for non-highlights
raise ExtractorError('Unable to extract user id')
videos = traverse_obj(self._download_json( videos = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
if not videos: if not videos:
self.raise_login_required('You need to log in to access this content') self.raise_login_required('You need to log in to access this content')
full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name')) full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
if not story_title: if not story_title:
story_title = f'Story by {username}' story_title = f'Story by {username}'
highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items')) highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
info_data = [] info_data = []
for highlight in highlights: for highlight in highlights:
highlight_data = self._extract_product(highlight) highlight_data = self._extract_product(highlight)
if highlight_data.get('formats'): if highlight_data.get('formats'):
info_data.append({ info_data.append({
**highlight_data,
'uploader': full_name, 'uploader': full_name,
'uploader_id': user_id, 'uploader_id': user_id,
**filter_dict(highlight_data),
}) })
return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)

View File

@ -6,7 +6,6 @@ from ..utils import (
int_or_none, int_or_none,
smuggle_url, smuggle_url,
traverse_obj, traverse_obj,
try_call,
unsmuggle_url, unsmuggle_url,
) )
@ -97,22 +96,13 @@ class LiTVIE(InfoExtractor):
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id) webpage, 'video data', default='{}'), video_id)
if not video_data: if not video_data:
payload = {'assetId': program_info['assetId']} payload = {
puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value) 'assetId': program_info['assetId'],
if puid: 'watchDevices': program_info['watchDevices'],
payload.update({ 'contentType': program_info['contentType'],
'type': 'auth', }
'puid': puid,
})
endpoint = 'getUrl'
else:
payload.update({
'watchDevices': program_info['watchDevices'],
'contentType': program_info['contentType'],
})
endpoint = 'getMainUrlNoAuth'
video_data = self._download_json( video_data = self._download_json(
f'https://www.litv.tv/vod/ajax/{endpoint}', video_id, 'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id,
data=json.dumps(payload).encode('utf-8'), data=json.dumps(payload).encode('utf-8'),
headers={'Content-Type': 'application/json'}) headers={'Content-Type': 'application/json'})

View File

@ -10,7 +10,6 @@ from ..compat import (
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
) )
from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
dict_get, dict_get,
@ -1318,51 +1317,41 @@ class TwitterIE(TwitterBaseIE):
} }
} }
def _call_syndication_api(self, twid):
self.report_warning(
'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={
'id': twid,
# TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
})
if not status:
raise ExtractorError('Syndication endpoint returned empty JSON response')
# Transform the result so its structure matches that of legacy/graphql
media = []
for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
detail['id_str'] = traverse_obj(detail, (
'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
media.append(detail)
status['extended_entities'] = {'media': media}
return status
def _extract_status(self, twid): def _extract_status(self, twid):
if self._selected_api not in ('graphql', 'legacy', 'syndication'): if self.is_logged_in or self._selected_api == 'graphql':
raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True) status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
try: elif self._selected_api == 'legacy':
if self.is_logged_in or self._selected_api == 'graphql': status = self._call_api(f'statuses/show/{twid}.json', twid, {
status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid) 'cards_platform': 'Web-12',
elif self._selected_api == 'legacy': 'include_cards': 1,
status = self._call_api(f'statuses/show/{twid}.json', twid, { 'include_reply_count': 1,
'cards_platform': 'Web-12', 'include_user_entities': 0,
'include_cards': 1, 'tweet_mode': 'extended',
'include_reply_count': 1, })
'include_user_entities': 0,
'tweet_mode': 'extended', elif self._selected_api == 'syndication':
self.report_warning(
'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
status = self._download_json(
'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
headers={'User-Agent': 'Googlebot'}, query={
'id': twid,
# TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
}) })
except ExtractorError as e: if not status:
if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: raise ExtractorError('Syndication endpoint returned empty JSON response')
raise # Transform the result so its structure matches that of legacy/graphql
self.report_warning('Rate-limit exceeded; falling back to syndication endpoint') media = []
status = self._call_syndication_api(twid) for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
detail['id_str'] = traverse_obj(detail, (
'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
media.append(detail)
status['extended_entities'] = {'media': media}
if self._selected_api == 'syndication': else:
status = self._call_syndication_api(twid) raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {} return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}
@ -1427,8 +1416,8 @@ class TwitterIE(TwitterBaseIE):
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available
'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# Prioritize m3u8 formats for compat, see https://github.com/yt-dlp/yt-dlp/issues/8117 # The codec of http formats are unknown
'_format_sort_fields': ('res', 'proto:m3u8', 'br', 'size'), # http format codec is unknown '_format_sort_fields': ('res', 'br', 'size', 'proto'),
} }
def extract_from_card_info(card): def extract_from_card_info(card):