Compare commits

..

8 Commits

Author SHA1 Message Date
alien-developers
91ca560381
Merge branch 'yt-dlp:master' into master 2023-12-25 07:46:11 +05:30
kclauhk
c39358a54b
[ie/Facebook] Fix Memories extraction (#8681)
- Support group /posts/ URLs
- Raise a proper error message if no formats are found

Closes #8669
Authored by: kclauhk
2023-12-24 23:43:35 +01:00
Lars Strojny
1f8bd8eba8
[ie/ARDBetaMediathek] Fix series extraction (#8687)
Closes #7666
Authored by: lstrojny
2023-12-24 23:38:21 +01:00
Simon Sawicki
00cdda4f6f
[core] Fix format selection parse error for CPython 3.12 (#8797)
Authored by: Grub4K
2023-12-24 22:09:01 +01:00
bashonly
116c268438
[ie/twitter] Work around API rate-limit (#8825)
Closes #8762
Authored by: bashonly
2023-12-24 16:41:28 +00:00
bashonly
e7d22348e7
[ie/twitter] Prioritize m3u8 formats (#8826)
Closes #8117
Authored by: bashonly
2023-12-24 16:40:50 +00:00
bashonly
50eaea9fd7
[ie/instagram] Fix stories extraction (#8843)
Closes #8290
Authored by: bashonly
2023-12-24 16:40:03 +00:00
bashonly
f45c4efcd9
[ie/litv] Fix premium content extraction (#8842)
Closes #8654
Authored by: bashonly
2023-12-24 16:33:16 +00:00
7 changed files with 127 additions and 72 deletions

View File

@ -140,6 +140,8 @@ class TestFormatSelection(unittest.TestCase):
test('example-with-dashes', 'example-with-dashes') test('example-with-dashes', 'example-with-dashes')
test('all', '2', '47', '45', 'example-with-dashes', '35') test('all', '2', '47', '45', 'example-with-dashes', '35')
test('mergeall', '2+47+45+example-with-dashes+35', multi=True) test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
# See: https://github.com/yt-dlp/yt-dlp/pulls/8797
test('7_a/worst', '35')
def test_format_selection_audio(self): def test_format_selection_audio(self):
formats = [ formats = [

View File

@ -2465,9 +2465,16 @@ class YoutubeDL:
return selector_function(ctx_copy) return selector_function(ctx_copy)
return final_selector return final_selector
stream = io.BytesIO(format_spec.encode()) # HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
# Prefix numbers with random letters to avoid it being classified as a number
# See: https://github.com/yt-dlp/yt-dlp/pulls/8797
# TODO: Implement parser not reliant on tokenize.tokenize
prefix = ''.join(random.choices(string.ascii_letters, k=32))
stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
try: try:
tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline))) tokens = list(_remove_unused_ops(
token._replace(string=token.string.replace(prefix, ''))
for token in tokenize.tokenize(stream.readline)))
except tokenize.TokenError: except tokenize.TokenError:
raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

View File

@ -292,7 +292,7 @@ class ARDIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
# available till 7.12.2023 # available till 7.12.2023
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html', 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
'md5': 'a438f671e87a7eba04000336a119ccc4', 'md5': '94812e6438488fb923c361a44469614b',
'info_dict': { 'info_dict': {
'id': 'maischberger-video-424', 'id': 'maischberger-video-424',
'display_id': 'maischberger-video-424', 'display_id': 'maischberger-video-424',
@ -403,26 +403,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'''(?x)https:// _VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/ (?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)? (?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|sammlung))/ (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)? (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
_TESTS = [{ _TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI', 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
'md5': '3fd5fead7a370a819341129c8d713136', 'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
'info_dict': { 'info_dict': {
'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen', 'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
'id': '12172961', 'id': '12939099',
'title': 'Wolfsland - Die traurigen Schwestern', 'title': 'Liebe auf vier Pfoten',
'description': r're:^Als der Polizeiobermeister Raaben', 'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
'duration': 5241, 'duration': 5222,
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957', 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
'timestamp': 1670710500, 'timestamp': 1701343800,
'upload_date': '20221210', 'upload_date': '20231130',
'ext': 'mp4', 'ext': 'mp4',
'age_limit': 12, 'episode': 'Liebe auf vier Pfoten',
'episode': 'Wolfsland - Die traurigen Schwestern',
'series': 'Filme im MDR' 'series': 'Filme im MDR'
}, },
}, { }, {
@ -454,7 +453,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'duration': 915, 'duration': 915,
'episode': 'tagesschau, 20:00 Uhr', 'episode': 'tagesschau, 20:00 Uhr',
'series': 'tagesschau', 'series': 'tagesschau',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49', 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
}, },
}, { }, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -475,6 +474,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sendung' # playlist of type 'sendung'
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/', 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
'only_matching': True, 'only_matching': True,
}, {
# playlist of type 'serie'
'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
'only_matching': True,
}, { }, {
# playlist of type 'sammlung' # playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@ -487,10 +490,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
""" Query the ARD server for playlist information """ Query the ARD server for playlist information
and returns the data in "raw" format """ and returns the data in "raw" format """
if mode == 'sendung': assert mode in ('sendung', 'serie', 'sammlung')
if mode in ('sendung', 'serie'):
graphQL = json.dumps({ graphQL = json.dumps({
'query': '''{ 'query': '''{
showPage( showPage(
@ -507,7 +511,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
links { target { id href title } } links { target { id href title } }
type type
} }
}}''' % (client, playlist_id, pageNumber), }}''' % (client, playlist_id, page_number),
}).encode() }).encode()
else: # mode == 'sammlung' else: # mode == 'sammlung'
graphQL = json.dumps({ graphQL = json.dumps({
@ -528,7 +532,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
type type
} }
} }
}}''' % (client, playlist_id, pageNumber), }}''' % (client, playlist_id, page_number),
}).encode() }).encode()
# Ressources for ARD graphQL debugging: # Ressources for ARD graphQL debugging:
# https://api-test.ardmediathek.de/public-gateway # https://api-test.ardmediathek.de/public-gateway
@ -538,7 +542,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
data=graphQL, data=graphQL,
headers={'Content-Type': 'application/json'})['data'] headers={'Content-Type': 'application/json'})['data']
# align the structure of the returned data: # align the structure of the returned data:
if mode == 'sendung': if mode in ('sendung', 'serie'):
show_page = show_page['showPage'] show_page = show_page['showPage']
else: # mode == 'sammlung' else: # mode == 'sammlung'
show_page = show_page['morePage']['widget'] show_page = show_page['morePage']['widget']
@ -546,12 +550,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode): def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
""" Collects all playlist entries and returns them as info dict. """ Collects all playlist entries and returns them as info dict.
Supports playlists of mode 'sendung' and 'sammlung', and also nested Supports playlists of mode 'sendung', 'serie', and 'sammlung',
playlists. """ as well as nested playlists. """
entries = [] entries = []
pageNumber = 0 pageNumber = 0
while True: # iterate by pageNumber while True: # iterate by pageNumber
show_page = self._ARD_load_playlist_snipped( show_page = self._ARD_load_playlist_snippet(
playlist_id, display_id, client, mode, pageNumber) playlist_id, display_id, client, mode, pageNumber)
for teaser in show_page['teasers']: # process playlist items for teaser in show_page['teasers']: # process playlist items
if '/compilation/' in teaser['links']['target']['href']: if '/compilation/' in teaser['links']['target']['href']:

View File

@ -52,7 +52,7 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=| )\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?| [^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/| [^/]+/posts/|
groups/[^/]+/permalink/| groups/[^/]+/(?:permalink|posts)/|
watchparty/ watchparty/
)| )|
facebook: facebook:
@ -232,6 +232,21 @@ class FacebookIE(InfoExtractor):
'uploader_id': '100013949973717', 'uploader_id': '100013949973717',
}, },
'skip': 'Requires logging in', 'skip': 'Requires logging in',
}, {
# data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
'info_dict': {
'id': '1569199726448814',
'ext': 'mp4',
'title': 'Pence MUST GO!',
'description': 'Vickie Gentry shared a memory.',
'timestamp': 1511548260,
'upload_date': '20171124',
'uploader': 'Vickie Gentry',
'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
'thumbnail': r're:^https?://.*',
'duration': 148.435,
},
}, { }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True, 'only_matching': True,
@ -612,9 +627,11 @@ class FacebookIE(InfoExtractor):
nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
attachments = traverse_obj(nodes, ( attachments = traverse_obj(nodes, (
..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or [] ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
'attachment', {dict}))
for attachment in attachments: for attachment in attachments:
ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
('target', 'attachments', ..., 'styles', 'attachment', {dict}))
for n in ns: for n in ns:
parse_attachment(n) parse_attachment(n)
parse_attachment(attachment) parse_attachment(attachment)
@ -637,7 +654,7 @@ class FacebookIE(InfoExtractor):
if len(entries) > 1: if len(entries) > 1:
return self.playlist_result(entries, video_id) return self.playlist_result(entries, video_id)
video_info = entries[0] video_info = entries[0] if entries else {'id': video_id}
webpage_info = extract_metadata(webpage) webpage_info = extract_metadata(webpage)
# honor precise duration in video info # honor precise duration in video info
if video_info.get('duration'): if video_info.get('duration'):

View File

@ -10,6 +10,7 @@ from ..utils import (
ExtractorError, ExtractorError,
decode_base_n, decode_base_n,
encode_base_n, encode_base_n,
filter_dict,
float_or_none, float_or_none,
format_field, format_field,
get_element_by_attribute, get_element_by_attribute,
@ -703,28 +704,31 @@ class InstagramStoryIE(InstagramBaseIE):
user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False)
if not user_info: if not user_info:
self.raise_login_required('This content is unreachable') self.raise_login_required('This content is unreachable')
user_id = user_info.get('id')
user_id = traverse_obj(user_info, 'pk', 'id', expected_type=str)
story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}'
if not story_info_url: # user id is only mandatory for non-highlights
raise ExtractorError('Unable to extract user id')
videos = traverse_obj(self._download_json( videos = traverse_obj(self._download_json(
f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}',
story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels')
if not videos: if not videos:
self.raise_login_required('You need to log in to access this content') self.raise_login_required('You need to log in to access this content')
full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name')) full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (user_id, 'user', 'full_name'))
story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title'))
if not story_title: if not story_title:
story_title = f'Story by {username}' story_title = f'Story by {username}'
highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (user_id, 'items'))
info_data = [] info_data = []
for highlight in highlights: for highlight in highlights:
highlight_data = self._extract_product(highlight) highlight_data = self._extract_product(highlight)
if highlight_data.get('formats'): if highlight_data.get('formats'):
info_data.append({ info_data.append({
**highlight_data,
'uploader': full_name, 'uploader': full_name,
'uploader_id': user_id, 'uploader_id': user_id,
**filter_dict(highlight_data),
}) })
return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title)

View File

@ -6,6 +6,7 @@ from ..utils import (
int_or_none, int_or_none,
smuggle_url, smuggle_url,
traverse_obj, traverse_obj,
try_call,
unsmuggle_url, unsmuggle_url,
) )
@ -96,13 +97,22 @@ class LiTVIE(InfoExtractor):
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id) webpage, 'video data', default='{}'), video_id)
if not video_data: if not video_data:
payload = { payload = {'assetId': program_info['assetId']}
'assetId': program_info['assetId'], puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
if puid:
payload.update({
'type': 'auth',
'puid': puid,
})
endpoint = 'getUrl'
else:
payload.update({
'watchDevices': program_info['watchDevices'], 'watchDevices': program_info['watchDevices'],
'contentType': program_info['contentType'], 'contentType': program_info['contentType'],
} })
endpoint = 'getMainUrlNoAuth'
video_data = self._download_json( video_data = self._download_json(
'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id, f'https://www.litv.tv/vod/ajax/{endpoint}', video_id,
data=json.dumps(payload).encode('utf-8'), data=json.dumps(payload).encode('utf-8'),
headers={'Content-Type': 'application/json'}) headers={'Content-Type': 'application/json'})

View File

@ -10,6 +10,7 @@ from ..compat import (
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
) )
from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
dict_get, dict_get,
@ -1317,20 +1318,7 @@ class TwitterIE(TwitterBaseIE):
} }
} }
def _extract_status(self, twid): def _call_syndication_api(self, twid):
if self.is_logged_in or self._selected_api == 'graphql':
status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
elif self._selected_api == 'legacy':
status = self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
})
elif self._selected_api == 'syndication':
self.report_warning( self.report_warning(
'Not all metadata or media is available via syndication endpoint', twid, only_once=True) 'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
status = self._download_json( status = self._download_json(
@ -1350,8 +1338,31 @@ class TwitterIE(TwitterBaseIE):
media.append(detail) media.append(detail)
status['extended_entities'] = {'media': media} status['extended_entities'] = {'media': media}
else: return status
raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
def _extract_status(self, twid):
if self._selected_api not in ('graphql', 'legacy', 'syndication'):
raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True)
try:
if self.is_logged_in or self._selected_api == 'graphql':
status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
elif self._selected_api == 'legacy':
status = self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
'include_user_entities': 0,
'tweet_mode': 'extended',
})
except ExtractorError as e:
if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
raise
self.report_warning('Rate-limit exceeded; falling back to syndication endpoint')
status = self._call_syndication_api(twid)
if self._selected_api == 'syndication':
status = self._call_syndication_api(twid)
return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {} return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}
@ -1416,8 +1427,8 @@ class TwitterIE(TwitterBaseIE):
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), # No longer available
'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000),
# The codec of http formats are unknown # Prioritize m3u8 formats for compat, see https://github.com/yt-dlp/yt-dlp/issues/8117
'_format_sort_fields': ('res', 'br', 'size', 'proto'), '_format_sort_fields': ('res', 'proto:m3u8', 'br', 'size'), # http format codec is unknown
} }
def extract_from_card_info(card): def extract_from_card_info(card):