Compare commits

...

7 Commits

Author SHA1 Message Date
garret1317
261008bcc6
Merge 696bf76b8b into f2a4983df7 2024-11-12 23:26:23 +00:00
Jackson Humphrey
f2a4983df7
[ie/archive.org] Fix comments extraction (#11527)
Closes #11526
Authored by: jshumphrey
2024-11-12 23:26:18 +00:00
bashonly
bacc31b05a
[ie/facebook] Fix formats extraction (#11513)
Closes #11497
Authored by: bashonly
2024-11-12 23:23:10 +00:00
garret1317
696bf76b8b better words 2024-11-07 18:15:45 +00:00
garret1317
f2821be22b Add support for username/password login 2024-11-07 17:38:26 +00:00
garret1317
5c4f4e7729 Add error messages when programme is not available 2024-11-07 15:20:03 +00:00
garret
6b883e15c8 [ie/radiko] rough patch to make timefree 30 possible
naive approach that assumes things dont do anything unusual
2024-11-01 16:11:42 +00:00
3 changed files with 104 additions and 10 deletions

View File

@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
}, },
}, },
], ],
}, {
# The reviewbody is None for one of the reviews; just need to extract data without crashing
'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'info_dict': {
'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
'ext': 'mp3',
'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
'creators': ['Grateful Dead'],
'duration': 338.31,
'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
'display_id': 'gd95-04-02d1t04.shn',
'location': 'Pyramid Arena',
'uploader': 'jon@archive.org',
'album': '1995-04-02 - Pyramid Arena',
'upload_date': '20040519',
'track_number': 4,
'release_date': '19950402',
'timestamp': 1084927901,
},
}] }]
@staticmethod @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
info['comments'].append({ info['comments'].append({
'id': review.get('review_id'), 'id': review.get('review_id'),
'author': review.get('reviewer'), 'author': review.get('reviewer'),
'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'), 'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
'timestamp': unified_timestamp(review.get('createdate')), 'timestamp': unified_timestamp(review.get('createdate')),
'parent': 'root'}) 'parent': 'root'})

View File

@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
return extract_video_data(try_get( return extract_video_data(try_get(
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats): def extract_dash_manifest(vid_data, formats, mpd_url=None):
dash_manifest = traverse_obj( dash_manifest = traverse_obj(
video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str) vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
if dash_manifest: if dash_manifest:
formats.extend(self._parse_mpd_formats( formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
mpd_url=url_or_none(video.get('dash_manifest_url')))) mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))
def process_formats(info): def process_formats(info):
# Downloads with browser's User-Agent are rate limited. Working around # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
video = video['creation_story'] video = video['creation_story']
video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner')) video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
video.update(reel_info) video.update(reel_info)
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
formats = [] formats = []
q = qualities(['sd', 'hd']) q = qualities(['sd', 'hd'])
# Legacy formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
('browser_native_sd_url', 'sd')): ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
if not playable_url: if not playable_url:
continue continue
if determine_ext(playable_url) == 'mpd': if determine_ext(playable_url) == 'mpd':
formats.extend(self._extract_mpd_formats(playable_url, video_id)) formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
else: else:
formats.append({ formats.append({
'format_id': format_id, 'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
'url': playable_url, 'url': playable_url,
}) })
extract_dash_manifest(fmt_data, formats) extract_dash_manifest(fmt_data, formats)
# New videoDeliveryResponse formats extraction
fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
for idx, dash_manifest in enumerate(dash_manifests):
extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
if not dash_manifests:
# Only extract from MPD URLs if the manifests are not already provided
for mpd_url in mpd_urls:
formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
formats.append({
'format_id': format_id,
# sd, hd formats w/o resolution info should be deprioritized below DASH
'quality': q(format_id) - 3,
'url': prog_fmt['progressive_url'],
})
for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
if not formats: if not formats:
# Do not append false positive entry w/o any formats # Do not append false positive entry w/o any formats
return return

View File

@ -1,9 +1,11 @@
import base64 import base64
import datetime
import random import random
import re import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
clean_html, clean_html,
@ -12,12 +14,14 @@ from ..utils import (
try_call, try_call,
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
urlencode_postdata,
) )
from ..utils.traversal import traverse_obj from ..utils.traversal import traverse_obj
class RadikoBaseIE(InfoExtractor): class RadikoBaseIE(InfoExtractor):
_GEO_BYPASS = False _GEO_BYPASS = False
_NETRC_MACHINE = 'radiko'
_FULL_KEY = None _FULL_KEY = None
_HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = ( _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = (
'https://c-rpaa.smartstream.ne.jp', 'https://c-rpaa.smartstream.ne.jp',
@ -37,6 +41,29 @@ class RadikoBaseIE(InfoExtractor):
'https://c-radiko.smartstream.ne.jp', 'https://c-radiko.smartstream.ne.jp',
) )
_JST = datetime.timezone(datetime.timedelta(hours=9))
_has_tf30 = None
def _perform_login(self, username, password):
    """Log in to radiko with an e-mail address and password.

    Posts the credentials to the member login endpoint and stores in
    ``self._has_tf30`` whether the ``privileges`` value of the response
    contains ``'2'`` (presumably the Timefree 30 privilege -- confirm
    against the radiko API).

    Raises:
        ExtractorError: with ``expected=True`` when the server answers
            HTTP 401; any other ExtractorError is re-raised unchanged.
    """
    try:
        login_info = self._download_json(
            'https://radiko.jp/ap/member/webapi/member/login', None, note='Logging in',
            data=urlencode_postdata({'mail': username, 'pass': password}))
    except ExtractorError as error:
        if isinstance(error.cause, HTTPError) and error.cause.status == 401:
            raise ExtractorError('Invalid username and/or password', expected=True)
        raise
    # 'privileges' may be absent or null in the response; fall back to an
    # empty container so the membership test cannot raise TypeError.
    self._has_tf30 = '2' in (login_info.get('privileges') or ())
def _check_tf30(self):
    """Return whether the current account has Timefree 30, if it can be told.

    A value already stored in ``self._has_tf30`` is returned as-is.
    Otherwise, when no ``radiko_session`` cookie exists for
    https://radiko.jp, ``None`` is returned without any request. With a
    session cookie, the login-check endpoint is queried, the comparison
    ``timefreeplus == '1'`` is stored in ``self._has_tf30`` and returned.
    """
    cached = self._has_tf30
    if cached is not None:
        return cached

    session_cookie = self._get_cookies('https://radiko.jp').get('radiko_session')
    if session_cookie is None:
        return None

    # expected_status=400 is forwarded to the downloader; presumably the
    # endpoint answers 400 for sessions that are not logged in -- confirm.
    account_info = self._download_json(
        'https://radiko.jp/ap/member/webapi/v2/member/login/check',
        None, note='Checking account status from cookies', expected_status=400)
    has_tf30 = account_info.get('timefreeplus') == '1'
    self._has_tf30 = has_tf30
    return has_tf30
def _negotiate_token(self): def _negotiate_token(self):
_, auth1_handle = self._download_webpage_handle( _, auth1_handle = self._download_webpage_handle(
'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page',
@ -99,17 +126,39 @@ class RadikoBaseIE(InfoExtractor):
self._FULL_KEY = full_key self._FULL_KEY = full_key
return full_key return full_key
def _get_broadcast_day(self, timestring):
    """Parse *timestring* and shift it onto its broadcast day.

    *timestring* must be in ``%Y%m%d%H%M%S`` form. Times whose hour is
    before 5 are moved back by one calendar day; the time-of-day fields are
    left unchanged. The result is a naive ``datetime.datetime``.
    """
    parsed = datetime.datetime.strptime(timestring, '%Y%m%d%H%M%S')
    # Hours 00-04 are counted as part of the previous day's schedule.
    days_back = 1 if parsed.hour < 5 else 0
    return parsed - datetime.timedelta(days=days_back)
def _get_broadcast_day_end(self, dt):
    """Return 05:00:00 on the calendar day after *dt*, in ``self._JST``.

    Only the year, month and day of *dt* are used; its time-of-day and any
    timezone information are discarded.
    """
    next_day = dt + datetime.timedelta(days=1)
    return datetime.datetime(
        year=next_day.year, month=next_day.month, day=next_day.day,
        hour=5, minute=0, second=0, tzinfo=self._JST)
def _find_program(self, video_id, station, cursor): def _find_program(self, video_id, station, cursor):
broadcast_day = self._get_broadcast_day(cursor)
broadcast_day_str = broadcast_day.strftime('%Y%m%d')
broadcast_day_end = self._get_broadcast_day_end(broadcast_day)
now = datetime.datetime.now(tz=self._JST)
if broadcast_day_end + datetime.timedelta(days=30) < now:
self.raise_no_formats('Programme is no longer available.', video_id=video_id, expected=True)
elif broadcast_day_end + datetime.timedelta(days=7) < now and not self._check_tf30():
self.raise_login_required('Programme is only available with a Timefree 30 subscription',
metadata_available=True)
station_program = self._download_xml( station_program = self._download_xml(
f'https://radiko.jp/v3/program/station/weekly/{station}.xml', video_id, f'https://api.radiko.jp/program/v3/date/{broadcast_day_str}/station/{station}.xml', station,
note=f'Downloading radio program for {station} station') note=f'Downloading programme information for {broadcast_day_str}')
prog = None prog = None
for p in station_program.findall('.//prog'): for p in station_program.findall('.//prog'):
ft_str, to_str = p.attrib['ft'], p.attrib['to'] ft_str, to_str = p.attrib['ft'], p.attrib['to']
ft = unified_timestamp(ft_str, False) ft = unified_timestamp(ft_str, False)
to = unified_timestamp(to_str, False) to = unified_timestamp(to_str, False)
if ft <= cursor and cursor < to: if ft_str <= cursor and cursor < to_str:
prog = p prog = p
break break
if not prog: if not prog:
@ -187,7 +236,7 @@ class RadikoIE(RadikoBaseIE):
station, timestring = self._match_valid_url(url).group('station', 'timestring') station, timestring = self._match_valid_url(url).group('station', 'timestring')
video_id = join_nonempty(station, timestring) video_id = join_nonempty(station, timestring)
vid_int = unified_timestamp(timestring, False) vid_int = unified_timestamp(timestring, False)
prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, timestring)
auth_token, area_id = self._auth_client() auth_token, area_id = self._auth_client()