Compare commits

...

28 Commits

Author SHA1 Message Date
ChocoLZS
6cb9dcb010
Merge 8f5a765e25 into f2a4983df7 2024-11-14 03:16:51 +01:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
ChocoLZS
8f5a765e25
Update yt_dlp/extractor/piaulizaportal.py
Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-10-28 11:22:18 +08:00
ChocoLZS
75ea808d0a
Apply suggestions from code review
Co-authored-by: sepro <sepro@sepr0.com>
2024-10-05 22:49:09 +08:00
ChocoLZS
b42637c619
Apply suggestions from code review
Co-authored-by: sepro <sepro@sepr0.com>
2024-10-01 16:31:47 +08:00
Mozi
a75a02ad2a
[ie/pialive] Follow your steps (#1)
* [ie/pialive] Support detecting upcoming and ended live events

* Pack API arguments

* fix UnboundLocalError for "chat_room_url"

* extract video_id from query string by "parse_qs()"

* Fix tests
2024-09-25 14:47:15 +08:00
ChocoLZS
d993580e6f
Apply suggestions from code review
Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-09-11 09:15:34 +08:00
chocoie
0f4cdc03d9 fix: suggestions 2024-09-09 21:35:40 +08:00
chocoie
7b94a0000c fix: error 2024-09-09 21:27:29 +08:00
ChocoLZS
aa410c803b
Apply suggestions from code review
Co-authored-by: sepro <sepro@sepr0.com>
2024-09-09 21:13:03 +08:00
ChocoLZS
83f4c5a98e fix: code style 2024-08-28 22:48:42 +08:00
ChocoLZS
841a557c0e fix: code style 2024-08-28 22:35:13 +08:00
ChocoLZS
bca2ca9852 fix: code style 2024-08-27 08:35:10 +00:00
ChocoLZS
a2ed14747b fix: code style 2024-08-25 23:49:08 +08:00
ChocoLZS
04f1bfde50 feat: use extract_comments instead 2024-08-25 23:26:37 +08:00
ChocoLZS
848a923252 doc: add note for json downloader 2024-08-24 23:44:28 +08:00
ChocoLZS
1ff33d1333 fix: remove unnecessary code 2024-08-24 23:34:06 +08:00
ChocoLZS
27b31cc3df chore: correct tests 2024-08-24 17:44:14 +08:00
ChocoLZS
fad7c2a75a chore: remove smuggled_url 2024-08-23 23:12:18 +08:00
ChocoLZS
1a22cc3d3f chore: remove unnecessary smuggled data 2024-08-23 10:01:07 +08:00
ChocoLZS
95dffc0e75 fix: use url_transparent 2024-08-23 09:18:01 +08:00
ChocoLZS
b3ebbaf8dd feat: fetch comments 2024-08-22 23:06:16 +08:00
ChocoLZS
1a16c62638 chore: rename regex and add referer 2024-08-22 19:51:05 +08:00
ChocoLZS
3fa3edc3c4 chore: use url instead of Request and add tests 2024-08-22 11:59:32 +08:00
ChocoLZS
edf1eebbce chore: extract id from m3u8 url 2024-08-21 23:55:59 +08:00
ChocoLZS
0639489bc5 feat: add embed player handler 2024-08-21 14:55:09 +08:00
ChocoLZS
9c2a6fa449 feat: add support for uliza in pia-live 2024-08-21 11:43:45 +08:00
5 changed files with 250 additions and 40 deletions

View File

@ -1518,8 +1518,12 @@ from .pgatour import PGATourIE
from .philharmoniedeparis import PhilharmonieDeParisIE from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE from .photobucket import PhotobucketIE
from .pialive import PiaLiveIE
from .piapro import PiaproIE from .piapro import PiaproIE
from .piaulizaportal import PIAULIZAPortalIE from .piaulizaportal import (
PIAULIZAPortalAPIIE,
PIAULIZAPortalIE,
)
from .picarto import ( from .picarto import (
PicartoIE, PicartoIE,
PicartoVodIE, PicartoVodIE,

View File

@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
}, },
}, },
], ],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}] }]
@staticmethod @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({ info['comments'].append({
'id': review.get('review_id'), 'id': review.get('review_id'),
'author': review.get('reviewer'), 'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')), 'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'}) 'parent': 'root'})

View File

@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get( return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats): def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj( dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str) vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest: if dash_manifest:
formats.extend(self._parse_mpd_formats( formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=url_or_none(video.get('dash_manifest_url')))) mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
def process_formats(info): def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
video = video['creation_story'] video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info) video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = [] formats = []
q = qualities(['sd', 'hd']) q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')): ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
if not playable_url: if not playable_url:
continue continue
if determine_ext(playable_url) == 'mpd': if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id)) formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else: else:
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
'url': playable_url, 'url': playable_url,
}) })
extract_dash_manifest(fmt_data, formats) extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats: if not formats:
# Do not append false positive entry w/o any formats # Do not append false positive entry w/o any formats
return return

126
yt_dlp/extractor/pialive.py Normal file
View File

@ -0,0 +1,126 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
multipart_encode,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import traverse_obj
class PiaLiveIE(InfoExtractor):
    # Extractor for PIA LIVE STREAM player pages (player.pia-live.jp/stream/<key>).
    # The page itself only carries metadata; the actual media is delegated to
    # whatever extractor matches the embedded player URL returned by the API.
    PLAYER_ROOT_URL = 'https://player.pia-live.jp/'
    PIA_LIVE_API_URL = 'https://api.pia-live.jp'
    # Key sent as the 'api_key' form field with every API request below
    API_KEY = 'kfds)FKFps-dms9e'
    _VALID_URL = r'https?://player\.pia-live\.jp/stream/(?P<id>[\w-]+)'
    _TESTS = [
        {
            'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krUDqGOwN4d61dCWQYOd6CTxl4hjya9dsfEZGsM4uGOUdax60lEI4twsXGXf7crmz8Gk__GhupTrWxA7RFRVt76',
            'info_dict': {
                'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
                'display_id': '2431867_001',
                'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)',
                'live_status': 'was_live',
                'comment_count': int,
            },
            'params': {
                'getcomments': True,
                'skip_download': True,
                'ignore_no_formats_error': True,
            },
            'skip': 'The video is no longer available',
        },
        {
            'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krJdu0GVBVbVy01IwpJ6J3qBEm3d9TCTt1d0eWpsZGj7DrOjVOmS7GAWGwyscMgiThopJvzgWC4H5b-7XQjAfRZ',
            'info_dict': {
                'id': '9ce8b8ba-f6d1-4d1f-83a0-18c3148ded93',
                'display_id': '2431867_002',
                'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)',
                'live_status': 'was_live',
                'comment_count': int,
            },
            'params': {
                'getcomments': True,
                'skip_download': True,
                'ignore_no_formats_error': True,
            },
            'skip': 'The video is no longer available',
        },
    ]

    def _extract_vars(self, variable, html):
        """Return the quoted string assigned to JS variable `variable` in `html`.

        Matches `var`/`const`/`let` declarations whose value is a single- or
        double-quoted, non-empty string literal.
        """
        return self._search_regex(
            rf'(?:var|const|let)\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
            html, f'variable {variable}', group='value')

    def _real_extract(self, url):
        """Resolve a player page to the embedded player URL (url_transparent).

        Raises ExtractorError when the page is marked as ended ('play-end').
        Returns an 'is_upcoming' entry without formats when the page shows a
        parsable start date; otherwise hands off to the embedded player URL.
        """
        video_key = self._match_id(url)
        webpage = self._download_webpage(url, video_key)
        program_code = self._extract_vars('programCode', webpage)
        article_code = self._extract_vars('articleCode', webpage)
        title = self._html_extract_title(webpage)

        if get_element_html_by_class('play-end', webpage):
            raise ExtractorError('The video is no longer available', expected=True, video_id=program_code)

        if start_info := clean_html(get_element_by_class('play-waiting__date', webpage)):
            # With fatal=False a failed match yields None rather than a tuple,
            # so fall back to a pair of Nones to keep the unpacking safe.
            date, time = self._search_regex(
                r'(?P<date>\d{4}/\d{1,2}/\d{1,2})\([月火水木金土日]\)(?P<time>\d{2}:\d{2})',
                start_info, 'start_info', fatal=False, group=('date', 'time')) or (None, None)
            if date and time:
                # The page shows no timezone; the code pins the time to +09:00
                release_timestamp_str = f'{date} {time} +09:00'
                release_timestamp = unified_timestamp(release_timestamp_str)
                self.raise_no_formats(f'The video will be available after {release_timestamp_str}', expected=True)
                return {
                    'id': program_code,
                    'title': title,
                    'live_status': 'is_upcoming',
                    'release_timestamp': release_timestamp,
                }

        payload, content_type = multipart_encode({
            'play_url': video_key,
            'api_key': self.API_KEY,
        })
        # Shared arguments for both API endpoints queried below
        api_kwargs = {
            'video_id': program_code,
            'data': payload,
            'headers': {'Content-Type': content_type, 'Referer': self.PLAYER_ROOT_URL},
        }

        player_tag_list = self._download_json(
            f'{self.PIA_LIVE_API_URL}/perf/player-tag-list/{program_code}', **api_kwargs,
            note='Fetching player tag list', errnote='Unable to fetch player tag list')

        # Only hit the chat endpoint when comments were actually requested
        chat_room_url = None
        if self.get_param('getcomments'):
            chat_room_url = traverse_obj(self._download_json(
                f'{self.PIA_LIVE_API_URL}/perf/chat-tag-list/{program_code}/{article_code}', **api_kwargs,
                note='Fetching chat info', errnote='Unable to fetch chat info', fatal=False),
                ('data', 'chat_one_tag', {extract_attributes}, 'src', {url_or_none}))

        # 'movie_one_tag' is an HTML tag string; its src attribute is the player URL.
        # NOTE(review): extract_comments presumably defers to _get_comments — confirm in InfoExtractor.
        return self.url_result(
            extract_attributes(player_tag_list['data']['movie_one_tag'])['src'], url_transparent=True,
            video_title=title, display_id=program_code, __post_extractor=self.extract_comments(
                program_code, chat_room_url))

    def _get_comments(self, video_id, chat_room_url):
        """Yield comment dicts parsed from the chat page's `_history` JS array.

        Yields nothing when no chat URL is known or the page cannot be fetched.
        """
        if not chat_room_url:
            return
        if comment_page := self._download_webpage(
                chat_room_url, video_id, headers={'Referer': self.PLAYER_ROOT_URL},
                note='Fetching comment page', errnote='Unable to fetch comment page', fatal=False):
            # Each history entry is a positional sequence; map indices to fields.
            # 'author_is_uploader' is True when the element at index 1 equals 2.
            yield from traverse_obj(self._search_json(
                r'var\s+_history\s*=', comment_page, 'comment list',
                video_id, contains_pattern=r'\[(?s:.+)\]', fatal=False), (..., {
                    'timestamp': 0,
                    'author_is_uploader': (1, {lambda x: x == 2}),
                    'author': 2,
                    'text': 3,
                    'id': 4,
                }))

View File

@ -1,11 +1,62 @@
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import ExtractorError, int_or_none, parse_qs, time_seconds
ExtractorError, from ..utils.traversal import traverse_obj
int_or_none,
parse_qs,
time_seconds, class PIAULIZAPortalAPIIE(InfoExtractor):
traverse_obj, _VALID_URL = r'https://player-api\.p\.uliza\.jp/v1/players/[^?#]+\?(?:[^#]*&)?name=(?P<id>[^#&]+)'
) _TESTS = [
{
'url': 'https://player-api.p.uliza.jp/v1/players/timeshift-disabled/pia/admin?type=normal&playerobjectname=ulizaPlayer&name=livestream01_dvr&repeatable=true',
'info_dict': {
'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
'ext': 'mp4',
'title': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
'live_status': 'was_live',
},
},
{
'url': 'https://player-api.p.uliza.jp/v1/players/uliza_jp_gallery_normal/promotion/admin?type=presentation&name=cookings&targetid=player1',
'info_dict': {
'id': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
'ext': 'mp4',
'title': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
'live_status': 'not_live',
},
},
{
'url': 'https://player-api.p.uliza.jp/v1/players/default-player/pia/admin?type=normal&name=pia_movie_uliza_fix&targetid=ulizahtml5&repeatable=true',
'info_dict': {
'id': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
'ext': 'mp4',
'title': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
'live_status': 'not_live',
},
},
]
def _real_extract(self, url):
display_id = self._match_id(url)
player_data = self._download_webpage(
url, display_id, headers={'Referer': 'https://player-api.p.uliza.jp/'},
note='Fetching player data', errnote='Unable to fetch player data')
m3u8_url = self._search_regex(
r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data, 'm3u8 url')
video_id = parse_qs(m3u8_url).get('ss', [display_id])[0]
formats = self._extract_m3u8_formats(m3u8_url, video_id)
m3u8_type = self._search_regex(
r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None)
return {
'id': video_id,
'title': video_id,
'formats': formats,
'live_status': {
'video': 'is_live',
'dvr': 'was_live', # short-term archives
}.get(m3u8_type, 'not_live'), # VOD or long-term archives
}
class PIAULIZAPortalIE(InfoExtractor): class PIAULIZAPortalIE(InfoExtractor):
@ -14,7 +65,8 @@ class PIAULIZAPortalIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44', 'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44',
'info_dict': { 'info_dict': {
'id': '005f18b7-e810-5618-cb82-0987c5755d44', 'id': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
'display_id': '005f18b7-e810-5618-cb82-0987c5755d44',
'title': 'プレゼンテーションプレイヤーのサンプル', 'title': 'プレゼンテーションプレイヤーのサンプル',
'live_status': 'not_live', 'live_status': 'not_live',
}, },
@ -25,7 +77,8 @@ class PIAULIZAPortalIE(InfoExtractor):
}, { }, {
'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1', 'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1',
'info_dict': { 'info_dict': {
'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d', 'id': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
'display_id': '005e1b23-fe93-5780-19a0-98e917cc4b7d',
'title': '【確認用】視聴サンプルページULIZA', 'title': '【確認用】視聴サンプルページULIZA',
'live_status': 'not_live', 'live_status': 'not_live',
}, },
@ -44,27 +97,9 @@ class PIAULIZAPortalIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
player_data = self._download_webpage( player_data_url = self._search_regex(
self._search_regex( r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"',
r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"', webpage, 'player data url')
webpage, 'player data url'), return self.url_result(
video_id, headers={'Referer': 'https://ulizaportal.jp/'}, player_data_url, PIAULIZAPortalAPIIE, url_transparent=True,
note='Fetching player data', errnote='Unable to fetch player data') display_id=video_id, video_title=self._html_extract_title(webpage))
formats = self._extract_m3u8_formats(
self._search_regex(
r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data,
'm3u8 url', default=None),
video_id, fatal=False)
m3u8_type = self._search_regex(
r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None)
return {
'id': video_id,
'title': self._html_extract_title(webpage),
'formats': formats,
'live_status': {
'video': 'is_live',
'dvr': 'was_live', # short-term archives
}.get(m3u8_type, 'not_live'), # VOD or long-term archives
}