Compare commits

...

3 Commits

Author SHA1 Message Date
Benjamin Krausse
38e84a6eca Final fixups after implementing bashonly's suggestions; adding media_type field (possibly incomplete) 2023-11-24 13:18:28 +01:00
Benjamin Krausse
daa0af541e Merge branch 'theplatform' of https://github.com/trainman261/yt-dlp into theplatform 2023-11-23 22:16:44 +01:00
Benjamin Krausse
c0efdb80d0 simplifying the creator, categories and tags fields according to bashonly's suggestions
Adding the media_type field in common. This has to be reviewed, as I don't know where else this has to be added.
2023-11-23 22:15:07 +01:00
4 changed files with 21 additions and 36 deletions

View File

@@ -197,6 +197,7 @@ class CBCPlayerIE(InfoExtractor):
'series': 'All in a Weekend Montreal', 'series': 'All in a Weekend Montreal',
'season': 'Season 2015', 'season': 'Season 2015',
'season_number': 2015, 'season_number': 2015,
'media_type': 'Excerpt',
}, },
}, { }, {
'url': 'http://www.cbc.ca/player/play/2164402062', 'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -221,6 +222,7 @@ class CBCPlayerIE(InfoExtractor):
'cancer', 'cancer',
], ],
'creator': 'Allison Johnson', 'creator': 'Allison Johnson',
'media_type': 'Excerpt',
}, },
}, { }, {
# Has subtitles # Has subtitles
@@ -245,6 +247,7 @@ class CBCPlayerIE(InfoExtractor):
'tags': 'count:1', 'tags': 'count:1',
'creator': 'News', 'creator': 'News',
'location': 'Canada', 'location': 'Canada',
'media_type': 'Full Program',
}, },
}] }]

View File

@@ -379,6 +379,8 @@ class InfoExtractor:
'private', 'premium_only', 'subscriber_only', 'needs_auth', 'private', 'premium_only', 'subscriber_only', 'needs_auth',
'unlisted' or 'public'. Use 'InfoExtractor._availability' 'unlisted' or 'public'. Use 'InfoExtractor._availability'
to set it to set it
media_type: The type of media, for instance a full show, an excerpt, a highlight, a trailer
or something similar that classifies what kind of video or audio this is.
_old_archive_ids: A list of old archive ids needed for backward compatibility _old_archive_ids: A list of old archive ids needed for backward compatibility
_format_sort_fields: A list of fields to use for sorting formats _format_sort_fields: A list of fields to use for sorting formats
__post_extractor: A function to be called just before the metadata is __post_extractor: A function to be called just before the metadata is

View File

@@ -57,6 +57,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'Series/The Tonight Show Starring Jimmy Fallon' 'Series/The Tonight Show Starring Jimmy Fallon'
], ],
'creator': None, 'creator': None,
'media_type': 'Full Episode',
}, },
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',
@@ -137,7 +138,8 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
'thumbnail': r're:https?://.+\.jpg', 'thumbnail': r're:https?://.+\.jpg',
'categories': [ 'categories': [
'Series/Quantum Leap 2022', 'Series/Quantum Leap 2022',
] ],
'media_type': 'Highlight',
}, },
'params': { 'params': {
'skip_download': 'm3u8', 'skip_download': 'm3u8',

View File

@@ -13,7 +13,6 @@ from ..utils import (
ExtractorError, ExtractorError,
float_or_none, float_or_none,
int_or_none, int_or_none,
str_or_none,
parse_qs, parse_qs,
unsmuggle_url, unsmuggle_url,
update_url_query, update_url_query,
@@ -23,7 +22,6 @@ from ..utils import (
traverse_obj, traverse_obj,
update_url, update_url,
urlhandle_detect_ext, urlhandle_detect_ext,
str_to_int,
) )
from ..networking import HEADRequest from ..networking import HEADRequest
@@ -105,34 +103,14 @@ class ThePlatformBaseIE(OnceIE):
for chapter in tp_chapters[:-1]: for chapter in tp_chapters[:-1]:
_add_chapter(chapter.get('startTime'), chapter.get('endTime')) _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
info_keywords_str = info.get('keywords', {str_or_none})
tags = []
if (info_keywords_str is not None) and (info_keywords_str != ''):
tags = re.split(', |; |,', info_keywords_str)
location = None
series = None
season_number = None
# The following can be uncommented as soon as #7838 is merged:
# media_type = None
categories = []
categories_data = info.get('categories') or []
for x in categories_data:
if x.get('name') is not None:
# Sometimes, there will be several kinds of categories
# in this case, it will have a label field with a value 'category'
if (x.get('label') is None) or (x.get('label') == 'category'):
categories.append(x.get('name'))
def extract_site_specific_field(field): def extract_site_specific_field(field):
# A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber' # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
return next((info[k] for k in info if k.endswith(f'${field}')), None) return next((info[k] for k in info if k.endswith(f'${field}') and info[k] != ''), None)
location = extract_site_specific_field('region') info_media_type = extract_site_specific_field('programmingType')
series = extract_site_specific_field('show') if not info_media_type:
season_number = int_or_none(extract_site_specific_field('seasonNumber')) info_media_type = extract_site_specific_field('type')
# the following can be uncommented as soon as #7838 is merged:
# if (re.match('.*\programmingType', key)) or (re.match('.*\type', key)):
# media_type = info[key]
return { return {
'title': info['title'], 'title': info['title'],
@@ -143,14 +121,14 @@ class ThePlatformBaseIE(OnceIE):
'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'), 'uploader': info.get('billingCode'),
'chapters': chapters, 'chapters': chapters,
'creator': info.get('author', {str_or_none}) if info.get('author', {str_or_none}) != '' else None, 'creator': traverse_obj(info, ('author', {str})) or None,
'categories': categories if len(categories) != 0 else None, 'categories': traverse_obj(info, (
'tags': tags if len(tags) != 0 else None, 'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
'location': str_or_none(location) if location != '' else None, 'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
'series': str_or_none(series) if series != '' else None, 'location': extract_site_specific_field('region'),
'season_number': int_or_none(season_number), 'series': extract_site_specific_field('show'),
# The following can be uncommented as soon as #7838 is merged and the matching line above is uncommented 'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
# 'media_type': media_type 'media_type': info_media_type,
} }
def _extract_theplatform_metadata(self, path, video_id): def _extract_theplatform_metadata(self, path, video_id):