Compare commits

..

4 Commits

Author SHA1 Message Date
bashonly
009ab8646c
better episode number extraction
Authored by: bashonly
2024-01-29 14:33:03 -06:00
bashonly
6dadce6529
Add /embed/player/ test
Authored by: bashonly
2024-01-29 14:13:46 -06:00
bashonly
9211e5d9e7
Add player page note
Authored by: bashonly
2024-01-29 14:07:11 -06:00
bashonly
84b51653b4
simplify
Authored by: bashonly
2024-01-29 13:59:54 -06:00

View File

@ -5,6 +5,7 @@ from ..utils import (
merge_dicts, merge_dicts,
parse_count, parse_count,
url_or_none, url_or_none,
urljoin,
) )
from ..utils.traversal import traverse_obj from ..utils.traversal import traverse_obj
@ -18,7 +19,7 @@ class NFBBaseIE(InfoExtractor):
r'const\s+episodesData\s*=', webpage, 'episode data', video_id, r'const\s+episodesData\s*=', webpage, 'episode data', video_id,
contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or [] contains_pattern=r'\[\s*{(?s:.+)}\s*\]', fatal=fatal) or []
def _extract_ep_info(self, data, video_id): def _extract_ep_info(self, data, video_id, slug=None):
info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], { info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], {
'description': ('description', {str}), 'description': ('description', {str}),
'thumbnail': ('thumbnail_url', {url_or_none}), 'thumbnail': ('thumbnail_url', {url_or_none}),
@ -35,7 +36,7 @@ class NFBBaseIE(InfoExtractor):
'id': video_id, 'id': video_id,
'title': join_nonempty('series', 'episode', from_dict=info, delim=' - '), 'title': join_nonempty('series', 'episode', from_dict=info, delim=' - '),
'episode_number': int_or_none(self._search_regex( 'episode_number': int_or_none(self._search_regex(
r'-e(?:pisode)?-?(\d+)(?:-|$)', video_id, 'episode number', default=None)), r'[/-]e(?:pisode)?-?(\d+)(?:[/-]|$)', slug or video_id, 'episode number', default=None)),
} }
@ -200,29 +201,33 @@ class NFBIE(NFBBaseIE):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
}, },
'params': {'skip_download': 'm3u8'}, 'params': {'skip_download': 'm3u8'},
}, {
'note': 'NFB film /embed/player/ page',
'url': 'https://www.nfb.ca/film/afterlife/embed/player/',
'info_dict': {
'id': 'afterlife',
'ext': 'mp4',
'title': 'Afterlife',
'description': 'md5:84951394f594f1fb1e62d9c43242fdf5',
'release_year': 1978,
'duration': 420.0,
'uploader': 'Ishu Patel',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {'skip_download': 'm3u8'},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
site, type_, slug = self._match_valid_url(url).group('site', 'type', 'id') site, type_, slug = self._match_valid_url(url).group('site', 'type', 'id')
# Need to construct the URL since we match /embed/player/ URLs as well
webpage, urlh = self._download_webpage_handle(f'https://www.{site}.ca/{type_}/{slug}/', slug) webpage, urlh = self._download_webpage_handle(f'https://www.{site}.ca/{type_}/{slug}/', slug)
json_ld = self._yield_json_ld(webpage, slug) # type_ can change from film to serie(s) after redirect; new slug may have episode number
type_, slug = self._match_valid_url(urlh.url).group('type', 'id')
# /film/ URLs have unique slugs used in the embed url embed_url = urljoin(f'https://www.{site}.ca', self._html_search_regex(
video_id = slug if type_ == 'film' else traverse_obj( r'<[^>]+\bid=["\']player-iframe["\'][^>]*\bsrc=["\']([^"\']+)', webpage, 'embed url'))
json_ld, (lambda _, v: 'VideoObject' in v['@type'], 'embedUrl', {self._match_id}), video_id = self._match_id(embed_url) # embed url has unique slug
get_all=False) or self._match_id(self._og_search_property('url', webpage, 'video id')) player = self._download_webpage(embed_url, video_id, 'Downloading player page')
# type_ may have changed from film to serie(s) after redirect
type_ = self._match_valid_url(urlh.url).group('type')
player = self._download_webpage(
f'https://www.{site}.ca/film/{video_id}/embed/player/', video_id,
'Downloading player page', query={
'player_mode': '',
'embed_mode': '0',
'auto_focus': '1',
'context_type': type_ if type_ == 'film' else 'episode',
})
if 'MESSAGE_GEOBLOCKED' in player: if 'MESSAGE_GEOBLOCKED' in player:
self.raise_geo_restricted(countries=self._GEO_COUNTRIES) self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
@ -244,12 +249,12 @@ class NFBIE(NFBBaseIE):
'release_year': int_or_none(self._html_search_regex( 'release_year': int_or_none(self._html_search_regex(
r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)', r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
webpage, 'release_year', default=None)), webpage, 'release_year', default=None)),
} if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id), video_id) } if type_ == 'film' else self._extract_ep_info(self._extract_ep_data(webpage, video_id, slug), video_id)
return merge_dicts({ return merge_dicts({
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
}, info, self._json_ld(json_ld, video_id)) }, info, self._search_json_ld(webpage, video_id, default={}))
class NFBSeriesIE(NFBBaseIE): class NFBSeriesIE(NFBBaseIE):