Compare commits


7 Commits

Author            SHA1        Message                                            Date
Michael Skyba     816a6dd032  Merge 2be0c24897 into f2a4983df7                   2024-11-13 01:47:06 +01:00
Jackson Humphrey  f2a4983df7  [ie/archive.org] Fix comments extraction (#11527)  2024-11-12 23:26:18 +00:00
                              Closes #11526; Authored by: jshumphrey
bashonly          bacc31b05a  [ie/facebook] Fix formats extraction (#11513)      2024-11-12 23:23:10 +00:00
                              Closes #11497; Authored by: bashonly
Michael Skyba     2be0c24897  [ie/suno] add fallbacks for basic page metadata    2024-11-04 23:54:46 -05:00
Michael Skyba     10a1a93352  [ie/suno] use regexes for thumbnail match tests    2024-11-04 23:54:45 -05:00
Michael Skyba     6ec19e942d  [ie/suno] add playlist extractor                   2024-11-04 23:54:43 -05:00
Michael Skyba     3c59d3e7a2  [ie/suno] add /song mp3 extractor                  2024-11-04 20:18:18 -05:00
4 changed files with 204 additions and 6 deletions

yt_dlp/extractor/_extractors.py

@@ -1983,6 +1983,7 @@ from .stretchinternet import StretchInternetIE
 from .stripchat import StripchatIE
 from .stv import STVPlayerIE
 from .substack import SubstackIE
+from .suno import SunoIE, SunoPlaylistIE
 from .sunporno import SunPornoIE
 from .sverigesradio import (
     SverigesRadioEpisodeIE,
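
Registering SunoIE and SunoPlaylistIE here is what lets yt-dlp route suno.com URLs to the new extractors through their _VALID_URL patterns. A minimal sketch of exercising that via the Python API (illustrative only; the URL comes from the test data further down):

import yt_dlp

# Once the import above is in place, URL matching picks SunoIE automatically.
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    info = ydl.extract_info('https://suno.com/song/ab39a04d-b2e6-463b-9b8e-ddea725422f5', download=False)
    print(info.get('title'))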

yt_dlp/extractor/archiveorg.py

@@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
                 },
             },
         ],
+    }, {
+        # The reviewbody is None for one of the reviews; just need to extract data without crashing
+        'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+        'info_dict': {
+            'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+            'ext': 'mp3',
+            'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'creators': ['Grateful Dead'],
+            'duration': 338.31,
+            'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
+            'display_id': 'gd95-04-02d1t04.shn',
+            'location': 'Pyramid Arena',
+            'uploader': 'jon@archive.org',
+            'album': '1995-04-02 - Pyramid Arena',
+            'upload_date': '20040519',
+            'track_number': 4,
+            'release_date': '19950402',
+            'timestamp': 1084927901,
+        },
     }]

     @staticmethod
@@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
                 info['comments'].append({
                     'id': review.get('review_id'),
                     'author': review.get('reviewer'),
-                    'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
+                    'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
                     'timestamp': unified_timestamp(review.get('createdate')),
                     'parent': 'root'})
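
The fix works because join_nonempty (from yt_dlp.utils) silently drops empty values, whereas the old string concatenation raised a TypeError when reviewbody was None. A minimal sketch with a made-up review dict (keys mirror the extractor, values are invented):

from yt_dlp.utils import join_nonempty

review = {'reviewtitle': 'Great show', 'reviewbody': None}

# Old code: str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody')
# -> TypeError, since str + None is not allowed.
# New code drops the None and joins whatever is left:
print(join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'))
# -> 'Great show'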

yt_dlp/extractor/facebook.py

@@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
                 return extract_video_data(try_get(
                     js_data, lambda x: x['jsmods']['instances'], list) or [])

-        def extract_dash_manifest(video, formats):
+        def extract_dash_manifest(vid_data, formats, mpd_url=None):
             dash_manifest = traverse_obj(
-                video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
+                vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
             if dash_manifest:
                 formats.extend(self._parse_mpd_formats(
                     compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
-                    mpd_url=url_or_none(video.get('dash_manifest_url'))))
+                    mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))

         def process_formats(info):
             # Downloads with browser's User-Agent are rate limited. Working around
@@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
                 video = video['creation_story']
                 video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
                 video.update(reel_info)
-            fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
+
             formats = []
             q = qualities(['sd', 'hd'])
+
+            # Legacy formats extraction
+            fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
             for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
                                    ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
                                    ('browser_native_sd_url', 'sd')):
@@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
                 if not playable_url:
                     continue
                 if determine_ext(playable_url) == 'mpd':
-                    formats.extend(self._extract_mpd_formats(playable_url, video_id))
+                    formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
                 else:
                     formats.append({
                         'format_id': format_id,
@@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
                         'url': playable_url,
                     })
             extract_dash_manifest(fmt_data, formats)
+
+            # New videoDeliveryResponse formats extraction
+            fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
+            mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
+            dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
+            for idx, dash_manifest in enumerate(dash_manifests):
+                extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
+            if not dash_manifests:
+                # Only extract from MPD URLs if the manifests are not already provided
+                for mpd_url in mpd_urls:
+                    formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
+            for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
+                format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
+                formats.append({
+                    'format_id': format_id,
+                    # sd, hd formats w/o resolution info should be deprioritized below DASH
+                    'quality': q(format_id) - 3,
+                    'url': prog_fmt['progressive_url'],
+                })
+            for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
+                formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
+
             if not formats:
                 # Do not append false positive entry w/o any formats
                 return
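
The 'quality': q(format_id) - 3 hint leans on yt_dlp.utils.qualities, which ranks an id by its position in the preference list. A small sketch of the arithmetic behind the "deprioritized below DASH" comment:

from yt_dlp.utils import qualities

q = qualities(['sd', 'hd'])
print(q('sd'), q('hd'), q('unknown'))  # 0 1 -1

# The progressive_urls branch stores q(format_id) - 3 as the quality hint,
# i.e. -3 for sd and -2 for hd, so these resolution-less entries rank below
# the formats parsed out of the DASH/HLS manifests.
print(q('sd') - 3, q('hd') - 3)  # -3 -2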

yt_dlp/extractor/suno.py (new file, 152 lines added)

@@ -0,0 +1,152 @@
import re

from .common import InfoExtractor
from ..utils import ExtractorError, unescapeHTML


class SunoBaseIE(InfoExtractor):
    def _get_title(self, webpage):
        return self._html_search_meta(
            ['og:title', 'twitter:title'], webpage, 'title',
            default=None) or self._html_extract_title(webpage)

    def _get_description(self, webpage):
        return self._html_search_meta(
            ['og:description', 'description', 'twitter:description'],
            webpage, 'description', default=None)

    def _get_thumbnail(self, webpage):
        return self._html_search_meta(
            ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)


class SunoIE(SunoBaseIE):
    _VALID_URL = r'https?://(?:www\.)?suno\.com/song/(?P<id>[-a-f0-9]+)'
    _TESTS = [
        {
            'url': 'https://suno.com/song/ab39a04d-b2e6-463b-9b8e-ddea725422f5',
            'md5': 'ef850763b175d8a3c7fba5e2dbdc6bc5',
            'info_dict': {
                'id': 'ab39a04d-b2e6-463b-9b8e-ddea725422f5',
                'title': 'Life\'s a Soundtrack · AI Funk Factory @ YT by @funk | Suno',
                'description': 'groovy funk, melodic song. Listen and make your own with Suno.',
                'thumbnail': r're:https?://.*903f2bd7-ccc0-4029-a76a-887f07ebc2df.*\.jpeg$',
                'ext': 'mp3',
            },
        },
        {
            'url': 'https://suno.com/song/9cbcb5f4-f367-4f1c-8a32-23ec62bdc47e',
            'md5': '2f038badef88d189891d5f8cd8d8804d',
            'info_dict': {
                'id': '9cbcb5f4-f367-4f1c-8a32-23ec62bdc47e',
                'title': 'Pequenos Prazeres da Vida by @groovebot | Suno',
                'description': 'pop bossa nova song. Listen and make your own with Suno.',
                'thumbnail': r're:https?://.*9cbcb5f4-f367-4f1c-8a32-23ec62bdc47e.*\.jpeg$',
                'ext': 'mp3',
            },
        },
    ]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        url = self._html_search_meta(
            ['og:audio', 'twitter:player:stream'], webpage, 'url', default=None)

        return {
            'id': video_id,
            'title': self._get_title(webpage),
            'description': self._get_description(webpage),
            'thumbnail': self._get_thumbnail(webpage),
            'url': url,
        }


class SunoPlaylistIE(SunoBaseIE):
    _VALID_URL = r'https?://(?:www\.)?suno\.com/playlist/(?P<id>[-a-f0-9]+)'
    _TESTS = [
        {
            'url': 'https://suno.com/playlist/01f2ac32-c32e-4d26-b10c-221107c02946',
            'info_dict': {
                'id': '01f2ac32-c32e-4d26-b10c-221107c02946',
                'title': 'Main 0 by @contemplativetranspositions367 | Suno',
                'description': 'Hopefully the test case passed',
                'thumbnail': r're:https?://.*19d6d518-1b87-43b3-90b9-2a476ca5824a.*\.jpeg$',
            },
            'playlist': [{
                'info_dict': {
                    'id': '19d6d518-1b87-43b3-90b9-2a476ca5824a',
                    'title': 'Ceaseless <Echoes>',
                    'ext': 'mp3',
                },
            }],
            'playlist_count': 1,
        },
        {
            'url': 'https://www.suno.com/playlist/568eeaab-dfbf-4da6-aa0a-0fb1a32330de',
            'info_dict': {
                'id': '568eeaab-dfbf-4da6-aa0a-0fb1a32330de',
                'title': 'Piano by @kunal | Suno',
                'description': 'Here are some good piano',
                'thumbnail': r're:https?://.*0ecc0956-3b17-4d4b-8504-55849dd75e22.*\.jpeg$',
            },
            'playlist': [
                {
                    'info_dict': {
                        'id': '0ecc0956-3b17-4d4b-8504-55849dd75e22',
                        'title': 'ST',
                        'ext': 'mp3',
                    },
                },
                {
                    'info_dict': {
                        'id': '3fef7d44-c5a3-4181-9de3-d81542af23ef',
                        'title': 'ST',
                        'ext': 'mp3',
                    },
                },
                {
                    'info_dict': {
                        'id': '15e797fa-06c0-4e11-8cc0-3b2580476039',
                        'title': 'ST - 2',
                        'ext': 'mp3',
                    },
                },
            ],
            'playlist_count': 3,
        },
    ]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # There are <a>s whose href is a song/ID path. The <span>s directly
        # within them have the respective song title as their innerHTML.
        # Alternatively, this info can be extracted through parsing an escaped
        # JSON object inside a <script> array, though that seems even less stable
        # than this HTML.
        songs_regex = r'/song/(?P<id>[-a-f0-9]+)["\'][^>]*>\s*<span[^>]*>\s*(?P<title>[^<]+)</span>'
        songs = re.findall(songs_regex, webpage)

        og_audio_regex = self._og_regexes('audio')[0]
        audio_urls = [matches[0] for matches in re.findall(og_audio_regex, webpage)]

        if len(songs) != len(audio_urls):
            raise ExtractorError('Unexpected mismatch between song HTML list and og audio URLs')

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._get_title(webpage),
            'description': self._get_description(webpage),
            'thumbnail': self._get_thumbnail(webpage),
            'entries': [{
                'id': song_tuple[0],
                'title': unescapeHTML(song_tuple[1]),
                'url': audio_urls[i],
            } for i, song_tuple in enumerate(songs)],
        }
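
For reference, the song regex pairs each ID with its title because re.findall returns one (id, title) tuple per match. A quick illustration against hypothetical markup shaped like the comment above describes (not copied from a real suno.com page):

import re

songs_regex = r'/song/(?P<id>[-a-f0-9]+)["\'][^>]*>\s*<span[^>]*>\s*(?P<title>[^<]+)</span>'
html = '<a href="/song/19d6d518-1b87-43b3-90b9-2a476ca5824a" class="c"><span dir="auto">Ceaseless &lt;Echoes&gt;</span></a>'

print(re.findall(songs_regex, html))
# [('19d6d518-1b87-43b3-90b9-2a476ca5824a', 'Ceaseless &lt;Echoes&gt;')]
# unescapeHTML() then restores the entity-escaped title to 'Ceaseless <Echoes>'.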