Final fixups after implementing bashonly's suggestions; adding media_type field (possibly incomplete)

Merge branch 'theplatform' of https://github.com/trainman261/yt-dlp into theplatform
simplifying the creator, categories and tags fields according to bashonly's suggestions
2024-11-24 08:11:31 +01:00 · 2023-11-24 13:18:28 +01:00 · 2023-11-23 22:16:44 +01:00 · 2023-11-23 22:15:07 +01:00
4 changed files with 21 additions and 36 deletions
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -197,6 +197,7 @@ class CBCPlayerIE(InfoExtractor):
            'series': 'All in a Weekend Montreal',
            'season': 'Season 2015',
            'season_number': 2015,
+            'media_type': 'Excerpt',
        },
    }, {
        'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -221,6 +222,7 @@ class CBCPlayerIE(InfoExtractor):
                'cancer',
            ],
            'creator': 'Allison Johnson',
+            'media_type': 'Excerpt',
        },
    }, {
        # Has subtitles
@@ -245,6 +247,7 @@ class CBCPlayerIE(InfoExtractor):
            'tags': 'count:1',
            'creator': 'News',
            'location': 'Canada',
+            'media_type': 'Full Program',
        },
    }]

--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -379,6 +379,8 @@ class InfoExtractor:
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
+    media_type:     The type of media, for instance a full show, an excerpt, a highlight, a trailer
+                    or something similar that classifies what kind of video or audio this is.
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
--- a/yt_dlp/extractor/nbc.py
+++ b/yt_dlp/extractor/nbc.py
@@ -57,6 +57,7 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                    'Series/The Tonight Show Starring Jimmy Fallon'
                ],
                'creator': None,
+                'media_type': 'Full Episode',
            },
            'params': {
                'skip_download': 'm3u8',
@@ -137,7 +138,8 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                'thumbnail': r're:https?://.+\.jpg',
                'categories': [
                    'Series/Quantum Leap 2022',
-                ]
+                ],
+                'media_type': 'Highlight',
            },
            'params': {
                'skip_download': 'm3u8',
--- a/yt_dlp/extractor/theplatform.py
+++ b/yt_dlp/extractor/theplatform.py
@@ -13,7 +13,6 @@ from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
-    str_or_none,
    parse_qs,
    unsmuggle_url,
    update_url_query,
@@ -23,7 +22,6 @@ from ..utils import (
    traverse_obj,
    update_url,
    urlhandle_detect_ext,
-    str_to_int,
 )
 from ..networking import HEADRequest

@@ -105,34 +103,14 @@ class ThePlatformBaseIE(OnceIE):
            for chapter in tp_chapters[:-1]:
                _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
            _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
-        info_keywords_str = info.get('keywords', {str_or_none})
-        tags = []
-        if (info_keywords_str is not None) and (info_keywords_str != ''):
-            tags = re.split(', |; |,', info_keywords_str)
-        location = None
-        series = None
-        season_number = None
-        # The following can be uncommented as soon as #7838 is merged:
-        # media_type = None
-        categories = []
-        categories_data = info.get('categories') or []
-        for x in categories_data:
-            if x.get('name') is not None:
-                # Sometimes, there will be several kinds of categories
-                # in this case, it will have a label field with a value 'category'
-                if (x.get('label') is None) or (x.get('label') == 'category'):
-                    categories.append(x.get('name'))

        def extract_site_specific_field(field):
            # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
-            return next((info[k] for k in info if k.endswith(f'${field}')), None)
-        
-        location = extract_site_specific_field('region')
-        series = extract_site_specific_field('show')
-        season_number = int_or_none(extract_site_specific_field('seasonNumber'))
-            # the following can be uncommented as soon as #7838 is merged:
-            # if (re.match('.*\programmingType', key)) or (re.match('.*\type', key)):
-                # media_type = info[key]
+            return next((info[k] for k in info if k.endswith(f'${field}') and info[k] != ''), None)
+
+        info_media_type = extract_site_specific_field('programmingType')
+        if not info_media_type:
+            info_media_type = extract_site_specific_field('type')

        return {
            'title': info['title'],
@@ -143,14 +121,14 @@ class ThePlatformBaseIE(OnceIE):
            'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
            'uploader': info.get('billingCode'),
            'chapters': chapters,
-            'creator': info.get('author', {str_or_none}) if info.get('author', {str_or_none}) != '' else None,
-            'categories': categories if len(categories) != 0 else None,
-            'tags': tags if len(tags) != 0 else None,
-            'location': str_or_none(location) if location != '' else None,
-            'series': str_or_none(series) if series != '' else None,
-            'season_number': int_or_none(season_number),
-            # The following can be uncommented as soon as #7838 is merged and the matching line above is uncommented
-            # 'media_type': media_type
+            'creator': traverse_obj(info, ('author', {str})) or None,
+            'categories': traverse_obj(info, (
+                'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
+            'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
+            'location': extract_site_specific_field('region'),
+            'series': extract_site_specific_field('show'),
+            'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
+            'media_type': info_media_type,
        }

    def _extract_theplatform_metadata(self, path, video_id):
Author	SHA1	Message	Date
Benjamin Krausse	38e84a6eca	Final fixups after implementing bashonly's suggestions; adding media_type field (possibly incomplete)	2023-11-24 13:18:28 +01:00
Benjamin Krausse	daa0af541e	Merge branch 'theplatform' of https://github.com/trainman261/yt-dlp into theplatform	2023-11-23 22:16:44 +01:00
Benjamin Krausse	c0efdb80d0	simplifying the creator, categories and tags fields according to bashonly's suggestions. Adding the media_type field in common. This has to be reviewed, as I don't know where else this has to be added.	2023-11-23 22:15:07 +01:00