Compare commits

...

8 Commits

Author SHA1 Message Date
Subrat Lima
599a610957
Merge a060e2681e into f2a4983df7 2024-11-13 01:47:05 +01:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
manav_chaudhary
a9f85670d0
[ie/Chaturbate] Support alternate domains (#10595)
Closes #10594
Authored by: manavchaudhary1
2024-11-11 23:41:56 +01:00
Subrat Lima
a060e2681e
Update yt_dlp/extractor/subsplash.py
Co-authored-by: N/Ame <173015200+grqz@users.noreply.github.com>
2024-10-31 22:30:53 +05:30
Subrat Lima
96546d2147
Update yt_dlp/extractor/subsplash.py
Co-authored-by: N/Ame <173015200+grqz@users.noreply.github.com>
2024-10-31 22:30:37 +05:30
subrat-lima
dd1e36694b [ie/subsplash] fixed formatting in _extractors.py imports 2024-09-22 01:27:38 +05:30
subrat-lima
58c920f79a [ie/subsplash] added SubsplashVideoIE & SubsplashPlaylistIE 2024-09-22 01:09:26 +05:30
5 changed files with 179 additions and 9 deletions

View File

@ -1982,6 +1982,10 @@ from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE from .stretchinternet import StretchInternetIE
from .stripchat import StripchatIE from .stripchat import StripchatIE
from .stv import STVPlayerIE from .stv import STVPlayerIE
from .subsplash import (
SubsplashPlaylistIE,
SubsplashVideoIE,
)
from .substack import SubstackIE from .substack import SubstackIE
from .sunporno import SunPornoIE from .sunporno import SunPornoIE
from .sverigesradio import ( from .sverigesradio import (

View File

@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
}, },
}, },
], ],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}] }]
@staticmethod @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({ info['comments'].append({
'id': review.get('review_id'), 'id': review.get('review_id'),
'author': review.get('reviewer'), 'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')), 'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'}) 'parent': 'root'})

View File

@ -9,7 +9,7 @@ from ..utils import (
class ChaturbateIE(InfoExtractor): class ChaturbateIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)' _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.(?P<tld>com|eu|global)/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.chaturbate.com/siswet19/', 'url': 'https://www.chaturbate.com/siswet19/',
'info_dict': { 'info_dict': {
@ -29,15 +29,24 @@ class ChaturbateIE(InfoExtractor):
}, { }, {
'url': 'https://en.chaturbate.com/siswet19/', 'url': 'https://en.chaturbate.com/siswet19/',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://chaturbate.eu/siswet19/',
'only_matching': True,
}, {
'url': 'https://chaturbate.eu/fullvideo/?b=caylin',
'only_matching': True,
}, {
'url': 'https://chaturbate.global/siswet19/',
'only_matching': True,
}] }]
_ROOM_OFFLINE = 'Room is currently offline' _ROOM_OFFLINE = 'Room is currently offline'
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id, tld = self._match_valid_url(url).group('id', 'tld')
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.com/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers()) headers=self.geo_verification_headers())
found_m3u8_urls = [] found_m3u8_urls = []

View File

@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get( return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats): def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj( dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str) vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest: if dash_manifest:
formats.extend(self._parse_mpd_formats( formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=url_or_none(video.get('dash_manifest_url')))) mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
def process_formats(info): def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
video = video['creation_story'] video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info) video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = [] formats = []
q = qualities(['sd', 'hd']) q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')): ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
if not playable_url: if not playable_url:
continue continue
if determine_ext(playable_url) == 'mpd': if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id)) formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else: else:
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
'url': playable_url, 'url': playable_url,
}) })
extract_dash_manifest(fmt_data, formats) extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats: if not formats:
# Do not append false positive entry w/o any formats # Do not append false positive entry w/o any formats
return return

View File

@ -0,0 +1,112 @@
import functools
import math

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    InAdvancePagedList,
    int_or_none,
    str_or_none,
    traverse_obj,
    unified_strdate,
    url_or_none,
)
class SubsplashVideoIE(InfoExtractor):
    """Extractor for a single Subsplash media item page (``/u/<name>/media/d/<short code>``)."""

    _VALID_URL = r'https?://(?:www\.)?subsplash\.com/u/[^/]+/media/d/(?P<id>\w+)'
    _TESTS = [{
        'url': 'https://subsplash.com/u/skywatchtv/media/d/5whnx5s-the-grand-delusion-taking-place-right-now',
        'md5': '2d67c50deac3c6c49c6e25c4a5b25afe',
        'info_dict': {
            'id': '33f8d305-68ab-414c-acf2-f2317a0abe21',
            'ext': 'mp4',
            'title': 'THE GRAND DELUSION TAKING PLACE RIGHT NOW!',
            'description': 'md5:220a630865c3697b0ec9dcb3a70cbc33',
            'upload_date': '20240901',
            'duration': 1710,
            'thumbnail': r're:https?://.*\.(?:jpg|png)$',
        },
    }, {
        'url': 'https://subsplash.com/u/prophecywatchers/media/d/n4dr8b2-the-transhumanist-plan-for-humanity-billy-crone',
        'md5': 'f7b4109ba7f012dff953391d6b400730',
        'info_dict': {
            'id': 'e16348f1-040e-4596-b922-77b45fa8d253',
            'ext': 'mp4',
            'title': 'The Transhumanist Plan for Humanity | Billy Crone',
            'description': None,
            'upload_date': '20240903',
            'duration': 1709,
            'thumbnail': r're:https?://.*\.(?:jpg|png)$',
        },
    }]

    def _fetch_json(self, url, display_id, token):
        """Download and parse JSON from `url`, sending `token` as a Bearer authorization header."""
        return self._download_json(url, display_id, headers={'Authorization': f'Bearer {token}'})

    def _extract_metadata(self, data, display_id):
        """Build an info dict from one media-item JSON object.

        `display_id` is only used to label the m3u8 manifest download.
        Fields whose source value is missing are omitted from the result.
        """
        return traverse_obj(data, {
            'id': ('id', {str_or_none}),
            'title': ('title', {str_or_none}),
            'description': ('summary_text', {str_or_none}),
            'thumbnail': ('_embedded', 'images', 0, '_links', 'related', 'href', {url_or_none}),
            # The raw value is divided by 1000 (presumably milliseconds -> seconds; confirm against the API)
            'duration': ('_embedded', 'video', 'duration', {functools.partial(int_or_none, scale=1000)}),
            'upload_date': ('published_at', {unified_strdate}),
            'formats': ('_embedded', 'video', '_embedded', 'playlists', 0, '_links', 'related', 'href',
                        {lambda m3u8_url: self._extract_m3u8_formats(m3u8_url, display_id)}),
        })

    def _real_extract(self, url):
        """Resolve the short code in `url` to its media item and return the info dict.

        Raises ExtractorError if no token can be read from the page response
        or if the API response contains no media item.
        """
        display_id = self._match_id(url)
        # Only the response headers are needed; the page body is discarded
        _, urlh = self._download_webpage_handle(url, display_id)

        # The token is the value of the first `name=value` pair of the Set-Cookie header.
        # partition() keeps any '=' that is part of the value itself and yields ''
        # (instead of raising) when the header is absent or malformed.
        set_cookie = urlh.get_header('set-cookie') or ''
        token = set_cookie.split(';')[0].partition('=')[2].strip()
        if not token:
            raise ExtractorError('Unable to extract API token from response cookies')

        metadata_url = (
            f'https://core.subsplash.com/media/v1/media-items?filter[short_code]={display_id}'
            '&include=images,audio.audio-outputs,audio.video,video.video-outputs,video.playlists,document,broadcast')
        metadata = self._fetch_json(metadata_url, display_id, token)

        media_item = traverse_obj(metadata, ('_embedded', 'media-items', 0, {dict}))
        if not media_item:
            # Fail clearly here instead of returning an empty info dict
            raise ExtractorError('Unable to find media item in API response')
        return self._extract_metadata(media_item, display_id)
class SubsplashPlaylistIE(SubsplashVideoIE):
    """Extractor for a Subsplash media series page (``/<name>/(our-videos|media)/ms/+<short code>``)."""

    IE_NAME = 'subsplash:playlist'
    _VALID_URL = r'https?://(?:www\.)?subsplash\.com/[^/]+/(?:our-videos|media)/ms/\+(?P<id>\w+)'
    _PAGE_SIZE = 15
    _TESTS = [{
        'url': 'https://subsplash.com/skywatchtv/our-videos/ms/+dbyjzp8',
        'info_dict': {
            'id': 'dbyjzp8',
            'title': 'Five in Ten',
        },
        'playlist_mincount': 24,
    }, {
        'url': 'https://subsplash.com/prophecywatchers/media/ms/+n42mr48',
        'info_dict': {
            'id': 'n42mr48',
            'title': 'Road to Zion Series',
        },
        'playlist_mincount': 13,
    }, {
        'url': 'https://subsplash.com/prophecywatchers/media/ms/+918b9f6',
        'only_matching': True,
    }]

    def _get_entries(self, token, series_id, page):
        """Yield the info dicts of one page of the series.

        `page` is zero-based; the API request uses `page + 1`.
        """
        url = (
            'https://core.subsplash.com/media/v1/media-items'
            '?filter[broadcast.status|broadcast.status]=null|on-demand'
            f'&filter[media_series]={series_id}&filter[status]=published'
            '&include=images,audio.audio-outputs,audio.video,video.video-outputs,video.playlists,document'
            f'&page[number]={page + 1}&page[size]={self._PAGE_SIZE}&sort=-position')
        data = self._fetch_json(url, f'{series_id}_{page}', token)
        # Branching with `...` always yields a list, so a response without
        # 'media-items' produces an empty page instead of iterating over None
        for media_item in traverse_obj(data, ('_embedded', 'media-items', ..., {dict})):
            yield self._extract_metadata(media_item, series_id)

    def _real_extract(self, url):
        """Look up the series for the short code in `url` and return a paged playlist.

        Raises ExtractorError if the API response lacks the series id or item count.
        """
        display_id = self._match_id(url)
        # Only the response headers are needed; the page body is discarded
        _, urlh = self._download_webpage_handle(url, display_id)
        # NOTE(review): if this header is absent the requests are sent with
        # 'Bearer None' - confirm whether the API accepts that
        token = urlh.get_header('x-api-token')

        series_url = f'https://core.subsplash.com/media/v1/media-series?filter[short_code]={display_id}'
        series_data = traverse_obj(self._fetch_json(series_url, display_id, token), (
            '_embedded', 'media-series', 0, {
                'id': ('id', {str}),
                'title': ('title', {str}),
                'count': ('media_items_count', {int}),
            })) or {}

        series_id = series_data.get('id')
        item_count = series_data.get('count')
        # Both values are needed to page through the series; a count of 0 is valid
        if not series_id or item_count is None:
            raise ExtractorError('Unable to find series metadata in API response')

        total_pages = math.ceil(item_count / self._PAGE_SIZE)
        entries = InAdvancePagedList(
            functools.partial(self._get_entries, token, series_id), total_pages, self._PAGE_SIZE)
        # The title is looked up with .get() so a series without one still yields a playlist
        return self.playlist_result(entries, display_id, series_data.get('title'))