Compare commits

..

No commits in common. "91ca560381165969194fb04e764782d720816f74" and "0eb370d3cd1071bdef32e82cfb05ee87e6f6fe2a" have entirely different histories.

7 changed files with 72 additions and 127 deletions

View File

@ -140,8 +140,6 @@ class TestFormatSelection(unittest.TestCase):
test('example-with-dashes', 'example-with-dashes')
test('all', '2', '47', '45', 'example-with-dashes', '35')
test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
# See: https://github.com/yt-dlp/yt-dlp/pulls/8797
test('7_a/worst', '35')
def test_format_selection_audio(self):
formats = [

View File

@ -2465,16 +2465,9 @@ class YoutubeDL:
return selector_function(ctx_copy)
return final_selector
# HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
# Prefix numbers with random letters to avoid it being classified as a number
# See: https://github.com/yt-dlp/yt-dlp/pulls/8797
# TODO: Implement parser not reliant on tokenize.tokenize
prefix = ''.join(random.choices(string.ascii_letters, k=32))
stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
stream = io.BytesIO(format_spec.encode())
try:
tokens = list(_remove_unused_ops(
token._replace(string=token.string.replace(prefix, ''))
for token in tokenize.tokenize(stream.readline)))
tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline)))
except tokenize.TokenError:
raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

View File

@ -292,7 +292,7 @@ class ARDIE(InfoExtractor):
_TESTS = [{
# available till 7.12.2023
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
'md5': '94812e6438488fb923c361a44469614b',
'md5': 'a438f671e87a7eba04000336a119ccc4',
'info_dict': {
'id': 'maischberger-video-424',
'display_id': 'maischberger-video-424',
@ -403,25 +403,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
(?:player|live|video|(?P<playlist>sendung|sammlung))/
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
_TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI',
'md5': '3fd5fead7a370a819341129c8d713136',
'info_dict': {
'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
'id': '12939099',
'title': 'Liebe auf vier Pfoten',
'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
'duration': 5222,
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
'timestamp': 1701343800,
'upload_date': '20231130',
'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen',
'id': '12172961',
'title': 'Wolfsland - Die traurigen Schwestern',
'description': r're:^Als der Polizeiobermeister Raaben',
'duration': 5241,
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957',
'timestamp': 1670710500,
'upload_date': '20221210',
'ext': 'mp4',
'episode': 'Liebe auf vier Pfoten',
'age_limit': 12,
'episode': 'Wolfsland - Die traurigen Schwestern',
'series': 'Filme im MDR'
},
}, {
@ -453,7 +454,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'duration': 915,
'episode': 'tagesschau, 20:00 Uhr',
'series': 'tagesschau',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49',
},
}, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -474,10 +475,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sendung'
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
'only_matching': True,
}, {
# playlist of type 'serie'
'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
'only_matching': True,
}, {
# playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@ -490,11 +487,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'only_matching': True,
}]
def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
""" Query the ARD server for playlist information
and returns the data in "raw" format """
assert mode in ('sendung', 'serie', 'sammlung')
if mode in ('sendung', 'serie'):
if mode == 'sendung':
graphQL = json.dumps({
'query': '''{
showPage(
@ -511,7 +507,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
links { target { id href title } }
type
}
}}''' % (client, playlist_id, page_number),
}}''' % (client, playlist_id, pageNumber),
}).encode()
else: # mode == 'sammlung'
graphQL = json.dumps({
@ -532,7 +528,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
type
}
}
}}''' % (client, playlist_id, page_number),
}}''' % (client, playlist_id, pageNumber),
}).encode()
# Ressources for ARD graphQL debugging:
# https://api-test.ardmediathek.de/public-gateway
@ -542,7 +538,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
data=graphQL,
headers={'Content-Type': 'application/json'})['data']
# align the structure of the returned data:
if mode in ('sendung', 'serie'):
if mode == 'sendung':
show_page = show_page['showPage']
else: # mode == 'sammlung'
show_page = show_page['morePage']['widget']
@ -550,12 +546,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
""" Collects all playlist entries and returns them as info dict.
Supports playlists of mode 'sendung', 'serie', and 'sammlung',
as well as nested playlists. """
Supports playlists of mode 'sendung' and 'sammlung', and also nested
playlists. """
entries = []
pageNumber = 0
while True: # iterate by pageNumber
show_page = self._ARD_load_playlist_snippet(
show_page = self._ARD_load_playlist_snipped(
playlist_id, display_id, client, mode, pageNumber)
for teaser in show_page['teasers']: # process playlist items
if '/compilation/' in teaser['links']['target']['href']:

View File

@ -52,7 +52,7 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/|
groups/[^/]+/(?:permalink|posts)/|
groups/[^/]+/permalink/|
watchparty/
)|
facebook:
@ -232,21 +232,6 @@ class FacebookIE(InfoExtractor):
'uploader_id': '100013949973717',
},
'skip': 'Requires logging in',
}, {
# data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
'info_dict': {
'id': '1569199726448814',
'ext': 'mp4',
'title': 'Pence MUST GO!',
'description': 'Vickie Gentry shared a memory.',
'timestamp': 1511548260,
'upload_date': '20171124',
'uploader': 'Vickie Gentry',
'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
'thumbnail': r're:^https?://.*',
'duration': 148.435,
},
}, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
@ -627,11 +612,9 @@ class FacebookIE(InfoExtractor):
nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
attachments = traverse_obj(nodes, (
..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
'attachment', {dict}))
..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or []
for attachment in attachments:
ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
('target', 'attachments', ..., 'styles', 'attachment', {dict}))
ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
for n in ns:
parse_attachment(n)
parse_attachment(attachment)
@ -654,7 +637,7 @@ class FacebookIE(InfoExtractor):
if len(entries) > 1:
return self.playlist_result(entries, video_id)
video_info = entries[0] if entries else {'id': video_id}
video_info = entries[0]
webpage_info = extract_metadata(webpage)
# honor precise duration in video info
if video_info.get('duration'):

View File

@ -10,7 +10,6 @@ from ..utils import (
ExtractorError,
decode_base_n,
encode_base_n,
filter_dict,
float_or_none,
format_field,
get_element_by_attribute,
@ -704,31 +703,28 @@ class InstagramStoryIE(InstagramBaseIE):
user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
if not user_info:
self.raise_login_required('This content is unreachable')
user_id = user_info.get('id')
user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str)
story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
if not story_info_url: # user id is only mandatory for non-highlights
raise ExtractorError('Unable to extract user id')
videos = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
if not videos:
self.raise_login_required('You need to log in to access this content')
full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name'))
full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name'))
story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
if not story_title:
story_title = f'Story by {username}'
highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items'))
highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items'))
info_data = []
for highlight in highlights:
highlight_data = self._extract_product(highlight)
if highlight_data.get('formats'):
info_data.append({
**highlight_data,
'uploader': full_name,
'uploader_id': user_id,
**filter_dict(highlight_data),
})
return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)

View File

@ -6,7 +6,6 @@ from ..utils import (
int_or_none,
smuggle_url,
traverse_obj,
try_call,
unsmuggle_url,
)
@ -97,22 +96,13 @@ class LiTVIE(InfoExtractor):
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id)
if not video_data:
payload = {'assetId': program_info['assetId']}
puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
if puid:
payload.update({
'type': 'auth',
'puid': puid,
})
endpoint = 'getUrl'
else:
payload.update({
payload = {
'assetId': program_info['assetId'],
'watchDevices': program_info['watchDevices'],
'contentType': program_info['contentType'],
})
endpoint = 'getMainUrlNoAuth'
}
video_data = self._download_json(
f'https://www.litv.tv/vod/ajax/{endpoint}', video_id,
'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id,
data=json.dumps(payload).encode('utf-8'),
headers={'Content-Type': 'application/json'})

View File

@ -10,7 +10,6 @@ from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
dict_get,
@ -1318,7 +1317,20 @@ class TwitterIE(TwitterBaseIE):
}
}
def _call_syndication_api(self, twid):
def _extract_status(self, twid):
if self.is_logged_in or self._selected_api == 'graphql':
status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
elif self._selected_api == 'legacy':
status = self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
})
elif self._selected_api == 'syndication':
self.report_warning(
'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
status = self._download_json(
@ -1338,31 +1350,8 @@ class TwitterIE(TwitterBaseIE):
media.append(detail)
status['extended_entities'] = {'media': media}
return status
def _extract_status(self, twid):
if self._selected_api not in ('graphql', 'legacy', 'syndication'):
raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True)
try:
if self.is_logged_in or self._selected_api == 'graphql':
status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
elif self._selected_api == 'legacy':
status = self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
})
except ExtractorError as e:
if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
raise
self.report_warning('Rate-limit exceeded; falling back to syndication endpoint')
status = self._call_syndication_api(twid)
if self._selected_api == 'syndication':
status = self._call_syndication_api(twid)
else:
raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}
@ -1427,8 +1416,8 @@ class TwitterIE(TwitterBaseIE):
'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available
'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# Prioritize m3u8 formats for compat, see https://github.com/yt-dlp/yt-dlp/issues/8117
'_format_sort_fields': ('res', 'proto:m3u8', 'br', 'size'), # http format codec is unknown
# The codec of http formats are unknown
'_format_sort_fields': ('res', 'br', 'size', 'proto'),
}
def extract_from_card_info(card):