Compare commits

..

4 Commits

Author SHA1 Message Date
bashonly
d9609608a8
Merge branch 'yt-dlp:master' into cleanup/2023-12 2023-12-25 16:34:12 -06:00
kclauhk
c39358a54b
[ie/Facebook] Fix Memories extraction (#8681)
- Support group /posts/ URLs
- Raise a proper error message if no formats are found

Closes #8669
Authored by: kclauhk
2023-12-24 23:43:35 +01:00
Lars Strojny
1f8bd8eba8
[ie/ARDBetaMediathek] Fix series extraction (#8687)
Closes #7666
Authored by: lstrojny
2023-12-24 23:38:21 +01:00
Simon Sawicki
00cdda4f6f
[core] Fix format selection parse error for CPython 3.12 (#8797)
Authored by: Grub4K
2023-12-24 22:09:01 +01:00
4 changed files with 59 additions and 29 deletions

View File

@ -140,6 +140,8 @@ class TestFormatSelection(unittest.TestCase):
test('example-with-dashes', 'example-with-dashes') test('example-with-dashes', 'example-with-dashes')
test('all', '2', '47', '45', 'example-with-dashes', '35') test('all', '2', '47', '45', 'example-with-dashes', '35')
test('mergeall', '2+47+45+example-with-dashes+35', multi=True) test('mergeall', '2+47+45+example-with-dashes+35', multi=True)
# See: https://github.com/yt-dlp/yt-dlp/pull/8797
test('7_a/worst', '35')
def test_format_selection_audio(self): def test_format_selection_audio(self):
formats = [ formats = [

View File

@ -2471,9 +2471,16 @@ class YoutubeDL:
return selector_function(ctx_copy) return selector_function(ctx_copy)
return final_selector return final_selector
stream = io.BytesIO(format_spec.encode()) # HACK: Python 3.12 changed the underlying parser, rendering '7_a' invalid
# Prefix numbers with random letters to avoid it being classified as a number
# See: https://github.com/yt-dlp/yt-dlp/pull/8797
# TODO: Implement parser not reliant on tokenize.tokenize
prefix = ''.join(random.choices(string.ascii_letters, k=32))
stream = io.BytesIO(re.sub(r'\d[_\d]*', rf'{prefix}\g<0>', format_spec).encode())
try: try:
tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline))) tokens = list(_remove_unused_ops(
token._replace(string=token.string.replace(prefix, ''))
for token in tokenize.tokenize(stream.readline)))
except tokenize.TokenError: except tokenize.TokenError:
raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

View File

@ -292,7 +292,7 @@ class ARDIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
# available till 7.12.2023 # available till 7.12.2023
'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html', 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
'md5': 'a438f671e87a7eba04000336a119ccc4', 'md5': '94812e6438488fb923c361a44469614b',
'info_dict': { 'info_dict': {
'id': 'maischberger-video-424', 'id': 'maischberger-video-424',
'display_id': 'maischberger-video-424', 'display_id': 'maischberger-video-424',
@ -403,26 +403,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
_VALID_URL = r'''(?x)https:// _VALID_URL = r'''(?x)https://
(?:(?:beta|www)\.)?ardmediathek\.de/ (?:(?:beta|www)\.)?ardmediathek\.de/
(?:(?P<client>[^/]+)/)? (?:(?P<client>[^/]+)/)?
(?:player|live|video|(?P<playlist>sendung|sammlung))/ (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
(?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)? (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
(?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
(?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
_TESTS = [{ _TESTS = [{
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy8xZGY0ZGJmZS00ZWQwLTRmMGItYjhhYy0wOGQ4ZmYxNjVhZDI', 'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
'md5': '3fd5fead7a370a819341129c8d713136', 'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
'info_dict': { 'info_dict': {
'display_id': 'filme-im-mdr/wolfsland-die-traurigen-schwestern/mdr-fernsehen', 'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
'id': '12172961', 'id': '12939099',
'title': 'Wolfsland - Die traurigen Schwestern', 'title': 'Liebe auf vier Pfoten',
'description': r're:^Als der Polizeiobermeister Raaben', 'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
'duration': 5241, 'duration': 5222,
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:efa186f7b0054957', 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
'timestamp': 1670710500, 'timestamp': 1701343800,
'upload_date': '20221210', 'upload_date': '20231130',
'ext': 'mp4', 'ext': 'mp4',
'age_limit': 12, 'episode': 'Liebe auf vier Pfoten',
'episode': 'Wolfsland - Die traurigen Schwestern',
'series': 'Filme im MDR' 'series': 'Filme im MDR'
}, },
}, { }, {
@ -454,7 +453,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'duration': 915, 'duration': 915,
'episode': 'tagesschau, 20:00 Uhr', 'episode': 'tagesschau, 20:00 Uhr',
'series': 'tagesschau', 'series': 'tagesschau',
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49', 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
}, },
}, { }, {
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -475,6 +474,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
# playlist of type 'sendung' # playlist of type 'sendung'
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/', 'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
'only_matching': True, 'only_matching': True,
}, {
# playlist of type 'serie'
'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
'only_matching': True,
}, { }, {
# playlist of type 'sammlung' # playlist of type 'sammlung'
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@ -487,10 +490,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
""" Query the ARD server for playlist information """ Query the ARD server for playlist information
and returns the data in "raw" format """ and returns the data in "raw" format """
if mode == 'sendung': assert mode in ('sendung', 'serie', 'sammlung')
if mode in ('sendung', 'serie'):
graphQL = json.dumps({ graphQL = json.dumps({
'query': '''{ 'query': '''{
showPage( showPage(
@ -507,7 +511,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
links { target { id href title } } links { target { id href title } }
type type
} }
}}''' % (client, playlist_id, pageNumber), }}''' % (client, playlist_id, page_number),
}).encode() }).encode()
else: # mode == 'sammlung' else: # mode == 'sammlung'
graphQL = json.dumps({ graphQL = json.dumps({
@ -528,7 +532,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
type type
} }
} }
}}''' % (client, playlist_id, pageNumber), }}''' % (client, playlist_id, page_number),
}).encode() }).encode()
# Resources for ARD graphQL debugging: # Resources for ARD graphQL debugging:
# https://api-test.ardmediathek.de/public-gateway # https://api-test.ardmediathek.de/public-gateway
@ -538,7 +542,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
data=graphQL, data=graphQL,
headers={'Content-Type': 'application/json'})['data'] headers={'Content-Type': 'application/json'})['data']
# align the structure of the returned data: # align the structure of the returned data:
if mode == 'sendung': if mode in ('sendung', 'serie'):
show_page = show_page['showPage'] show_page = show_page['showPage']
else: # mode == 'sammlung' else: # mode == 'sammlung'
show_page = show_page['morePage']['widget'] show_page = show_page['morePage']['widget']
@ -546,12 +550,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode): def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
""" Collects all playlist entries and returns them as info dict. """ Collects all playlist entries and returns them as info dict.
Supports playlists of mode 'sendung' and 'sammlung', and also nested Supports playlists of mode 'sendung', 'serie', and 'sammlung',
playlists. """ as well as nested playlists. """
entries = [] entries = []
pageNumber = 0 pageNumber = 0
while True: # iterate by pageNumber while True: # iterate by pageNumber
show_page = self._ARD_load_playlist_snipped( show_page = self._ARD_load_playlist_snippet(
playlist_id, display_id, client, mode, pageNumber) playlist_id, display_id, client, mode, pageNumber)
for teaser in show_page['teasers']: # process playlist items for teaser in show_page['teasers']: # process playlist items
if '/compilation/' in teaser['links']['target']['href']: if '/compilation/' in teaser['links']['target']['href']:

View File

@ -52,7 +52,7 @@ class FacebookIE(InfoExtractor):
)\?(?:.*?)(?:v|video_id|story_fbid)=| )\?(?:.*?)(?:v|video_id|story_fbid)=|
[^/]+/videos/(?:[^/]+/)?| [^/]+/videos/(?:[^/]+/)?|
[^/]+/posts/| [^/]+/posts/|
groups/[^/]+/permalink/| groups/[^/]+/(?:permalink|posts)/|
watchparty/ watchparty/
)| )|
facebook: facebook:
@ -232,6 +232,21 @@ class FacebookIE(InfoExtractor):
'uploader_id': '100013949973717', 'uploader_id': '100013949973717',
}, },
'skip': 'Requires logging in', 'skip': 'Requires logging in',
}, {
# data.node.comet_sections.content.story.attachments[].throwbackStyles.attachment_target_renderer.attachment.target.attachments[].styles.attachment.media
'url': 'https://www.facebook.com/groups/1645456212344334/posts/3737828833107051/',
'info_dict': {
'id': '1569199726448814',
'ext': 'mp4',
'title': 'Pence MUST GO!',
'description': 'Vickie Gentry shared a memory.',
'timestamp': 1511548260,
'upload_date': '20171124',
'uploader': 'Vickie Gentry',
'uploader_id': 'pfbid0FuZhHCeWDAxWxEbr3yKPFaRstXvRxgsp9uCPG6GjD4J2AitB35NUAuJ4Q75KcjiDl',
'thumbnail': r're:^https?://.*',
'duration': 148.435,
},
}, { }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True, 'only_matching': True,
@ -612,9 +627,11 @@ class FacebookIE(InfoExtractor):
nodes = variadic(traverse_obj(data, 'nodes', 'node') or []) nodes = variadic(traverse_obj(data, 'nodes', 'node') or [])
attachments = traverse_obj(nodes, ( attachments = traverse_obj(nodes, (
..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments', ..., 'comet_sections', 'content', 'story', (None, 'attached_story'), 'attachments',
..., ('styles', 'style_type_renderer'), 'attachment'), expected_type=dict) or [] ..., ('styles', 'style_type_renderer', ('throwbackStyles', 'attachment_target_renderer')),
'attachment', {dict}))
for attachment in attachments: for attachment in attachments:
ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] ns = traverse_obj(attachment, ('all_subattachments', 'nodes', ..., {dict}),
('target', 'attachments', ..., 'styles', 'attachment', {dict}))
for n in ns: for n in ns:
parse_attachment(n) parse_attachment(n)
parse_attachment(attachment) parse_attachment(attachment)
@ -637,7 +654,7 @@ class FacebookIE(InfoExtractor):
if len(entries) > 1: if len(entries) > 1:
return self.playlist_result(entries, video_id) return self.playlist_result(entries, video_id)
video_info = entries[0] video_info = entries[0] if entries else {'id': video_id}
webpage_info = extract_metadata(webpage) webpage_info = extract_metadata(webpage)
# honor precise duration in video info # honor precise duration in video info
if video_info.get('duration'): if video_info.get('duration'):