Compare commits

...

16 Commits

Author SHA1 Message Date
N/Ame
5fa0e2c6ce
Merge ea7d7d85d3 into f2a4983df7 2024-11-13 01:47:10 +01:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
grqx_wsl
ea7d7d85d3 request with impersonate=True 2024-11-01 11:15:54 +13:00
grqx_wsl
8e010ace41 make formats extraction non-fatal when geo-blocked or login required 2024-11-01 00:00:59 +13:00
grqx_wsl
0a05711805 float_or_none: use partial_application 2024-10-31 23:37:35 +13:00
grqx_wsl
6c6d75be16 Merge remote-tracking branch 'upstream/master' into ie/bahamut 2024-10-31 23:34:04 +13:00
grqx_wsl
50b2820684 Merge remote-tracking branch 'upstream/master' into ie/anigamer 2024-10-30 18:27:57 +13:00
grqx_wsl
ce031318fd [ie/anigamer] rename to bahamut 2024-10-30 18:11:38 +13:00
grqx_wsl
f8e15176cb add http headers at the root of the info_dict
simplify structure
2024-10-30 09:19:58 +13:00
grqx_wsl
2ad1cbf12d add http_headers for downloading 2024-10-29 13:14:30 +13:00
grqx_wsl
106f6c931b Merge remote-tracking branch 'upstream/master' into ie/anigamer 2024-10-28 13:28:21 +13:00
grqx_wsl
3b58fd1a20 add http_headers, extract all formats 2024-10-28 12:01:01 +13:00
grqx_wsl
99665e31b9 Merge remote-tracking branch 'upstream/master' into ie/anigamer 2024-10-28 11:28:14 +13:00
grqx_wsl
058ac436ba Use the same device_id in playlists 2024-10-23 18:32:11 +13:00
grqx_wsl
f87558f7a5 [ie/anigamer] add extractor 2024-10-23 10:40:48 +13:00
5 changed files with 166 additions and 6 deletions

View File

@ -1869,6 +1869,9 @@ The following extractors use this feature:
#### digitalconcerthall #### digitalconcerthall
* `prefer_combined_hls`: Prefer extracting combined/pre-merged video and audio HLS formats. This will exclude 4K/HEVC video and lossless/FLAC audio formats, which are only available as split video/audio HLS formats * `prefer_combined_hls`: Prefer extracting combined/pre-merged video and audio HLS formats. This will exclude 4K/HEVC video and lossless/FLAC audio formats, which are only available as split video/audio HLS formats
#### bahamut
* `device_id`: (optional) Device ID obtained from `https://ani.gamer.com.tw/ajax/getdeviceid.php` (bound to cookies and the `User-Agent` HTTP header). The extractor will automatically fetch one if it is not provided. E.g. `"bahamut:device_id=1234567890abcdef1234567890abcdef1234567890abcdef1234567890ab"`
**Note**: These options may be changed/removed in the future without concern for backward compatibility **Note**: These options may be changed/removed in the future without concern for backward compatibility
<!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE --> <!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE -->

View File

@ -196,6 +196,7 @@ from .awaan import (
) )
from .axs import AxsIE from .axs import AxsIE
from .azmedien import AZMedienIE from .azmedien import AZMedienIE
from .bahamut import BahamutIE
from .baidu import BaiduVideoIE from .baidu import BaiduVideoIE
from .banbye import ( from .banbye import (
BanByeChannelIE, BanByeChannelIE,

View File

@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
}, },
}, },
], ],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}] }]
@staticmethod @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({ info['comments'].append({
'id': review.get('review_id'), 'id': review.get('review_id'),
'author': review.get('reviewer'), 'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')), 'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'}) 'parent': 'root'})

111
yt_dlp/extractor/bahamut.py Normal file
View File

@ -0,0 +1,111 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
smuggle_url,
unified_timestamp,
unsmuggle_url,
)
from ..utils.traversal import traverse_obj
class BahamutIE(InfoExtractor):
    """Extractor for ani.gamer.com.tw (Bahamut) ``animeVideo.php?sn=<id>`` pages.

    A single URL may resolve either to one episode or, when playlist
    extraction is allowed, to a playlist of every episode listed for the
    anime. Playlist entries are smuggled URLs that carry the already obtained
    device ID and a flag that prevents them from expanding into a playlist
    again.
    """

    _VALID_URL = r'https?://ani\.gamer\.com\.tw/animeVideo\.php\?sn=(?P<id>\d+)'

    # Maps the site's numeric `rating` to an age limit; see anime_player.js
    RATING_TO_AGE_LIMIT = {
        1: 0,
        2: 6,
        3: 12,
        4: 15,
        5: 18,
        6: 18,  # age-gated, needs login
    }

    def _real_extract(self, url):
        """Return a playlist result or a single-video info dict for `url`.

        Raises ExtractorError when the device ID is rejected (error 1007 with
        no smuggled device ID left to drop) or when the m3u8 endpoint reports
        an error code that is not handled here.
        """
        url, unsmuggled_data = unsmuggle_url(url, {})
        video_id = self._match_id(url)

        # Resolution order: extractor argument, then the ID smuggled in by a
        # parent playlist, and only then a freshly requested one
        device_id = (
            self._configuration_arg('device_id', [None], casesense=True)[0]
            or unsmuggled_data.get('device_id')
            or self._download_json(
                'https://ani.gamer.com.tw/ajax/getdeviceid.php', video_id,
                'Downloading device ID', 'Failed to download device ID',
                impersonate=True, headers=self.geo_verification_headers())['deviceid'])

        # TODO: extract metadata from webpage
        metadata = {}
        if api_result := self._download_json(
                'https://api.gamer.com.tw/anime/v1/video.php', video_id,
                'Downloading video info', 'Failed to download video info',
                impersonate=True, query={'videoSn': video_id}).get('data'):
            # Series-level metadata; shared by the playlist and the video result
            metadata.update(traverse_obj(api_result, ('anime', {
                'description': 'content',
                'thumbnail': 'cover',
                'tags': 'tags',
                'creators': ('director', {lambda x: [x]}),
                'title': 'title',
            })))

        playlist_id = traverse_obj(api_result, ('video', 'animeSn')) or ''
        # Check the smuggled flag first: entries of a playlist we produced
        # ourselves must not go through _yes_playlist(), which would log a
        # misleading "Downloading playlist" message for every single episode
        if unsmuggled_data.get('extract_playlist') is not False and self._yes_playlist(playlist_id, video_id):
            # The first ellipsis extracts episodes of all languages,
            # maybe just extract episodes of the current language?
            # The filter skips entries without a usable `videoSn` instead of
            # crashing on them
            episodes = traverse_obj(
                api_result, ('anime', 'episodes', ..., lambda _, v: v['videoSn']))
            return self.playlist_result(
                (self.url_result(
                    # it may be better to use self.cache for storing device_id
                    smuggle_url(f'https://ani.gamer.com.tw/animeVideo.php?sn={ep["videoSn"]}', {
                        'extract_playlist': False,
                        'device_id': device_id,
                    }), ie=BahamutIE,
                    video_id=ep['videoSn'], thumbnail=ep.get('cover')) for ep in episodes),
                playlist_id=playlist_id, **metadata)

        # video-specific metadata, extract after returning the playlist result
        metadata.update(traverse_obj(api_result, ('video', {
            'thumbnail': 'cover',
            'title': 'title',
            'timestamp': ('upTime', {unified_timestamp}),
            # NOTE(review): `scale` is a divisor in float_or_none, so this
            # divides the API value by 60 - confirm the unit of `duration`
            # (if it is minutes, `invscale=60` is what yields seconds)
            'duration': ('duration', {float_or_none(scale=60)}),
            'age_limit': ('rating', {self.RATING_TO_AGE_LIMIT.get}),
        })))

        # HTTP 400 is accepted because the error details are read from the JSON body
        m3u8_info, urlh = self._download_json_handle(
            'https://ani.gamer.com.tw/ajax/m3u8.php', video_id,
            note='Downloading m3u8 URL', errnote='Failed to download m3u8 URL', query={
                'sn': video_id,
                'device': device_id,
            }, impersonate=True, headers=self.geo_verification_headers(), expected_status=400)

        formats_fatal = True
        if urlh.status == 400:
            # TODO: handle more error codes, search for /case \d+{4}:/g in anime_player.js
            error_code = traverse_obj(m3u8_info, ('error', 'code'))
            if error_code == 1011:
                # Geo-blocked: metadata is still available, so only the formats are given up
                self.raise_geo_restricted(metadata_available=True)
                formats_fatal = False
            elif error_code == 1007:
                # Device ID rejected: retry once without the smuggled ID so
                # that another one gets resolved; otherwise give up
                if unsmuggled_data.pop('device_id', None) is not None:
                    return self.url_result(
                        smuggle_url(f'https://ani.gamer.com.tw/animeVideo.php?sn={video_id}',
                                    unsmuggled_data), ie=BahamutIE, video_id=video_id)
                raise ExtractorError('Invalid device id!')
            elif error_code == 1017:
                # Login required: same treatment as the geo-blocked case
                self.raise_login_required(metadata_available=True)
                formats_fatal = False
            else:
                raise ExtractorError(
                    traverse_obj(m3u8_info, ('error', 'message')) or 'Failed to download m3u8 URL')

        return {
            **metadata,
            'id': video_id,
            'formats': self._extract_m3u8_formats(
                m3u8_info.get('src'), video_id, ext='mp4', fatal=formats_fatal, headers={
                    'Origin': 'https://ani.gamer.com.tw',
                    **self.geo_verification_headers(),
                }),
            # The same Origin header is also set for the actual download
            'http_headers': {'Origin': 'https://ani.gamer.com.tw'},
        }

View File

@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get( return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(vid_data, formats, mpd_url=None):
    """Append DASH formats parsed from an inline manifest in `vid_data` to `formats`.

    The manifest XML is looked up under 'dash_manifest', 'playlist',
    'dash_manifest_xml_string' and 'manifest_xml', in that order. `mpd_url` is
    only a fallback for when `vid_data` has no valid 'dash_manifest_url'.
    Nothing is added when no manifest string is found.
    """
    dash_manifest = traverse_obj(
        vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
    if dash_manifest:
        # Read the manifest URL from the data passed in, not from an outer
        # `video` variable: the parameter is named `vid_data`
        manifest_url = traverse_obj(vid_data, ('dash_manifest_url', {url_or_none}))
        formats.extend(self._parse_mpd_formats(
            compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
            mpd_url=manifest_url or mpd_url))
def process_formats(info): def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
video = video['creation_story'] video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info) video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = [] formats = []
q = qualities(['sd', 'hd']) q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')): ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
if not playable_url: if not playable_url:
continue continue
if determine_ext(playable_url) == 'mpd': if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id)) formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else: else:
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
'url': playable_url, 'url': playable_url,
}) })
extract_dash_manifest(fmt_data, formats) extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats: if not formats:
# Do not append false positive entry w/o any formats # Do not append false positive entry w/o any formats
return return