Merge 9de472f6ea into f2a4983df7

[ie/archive.org] Fix comments extraction (#11527 )
Closes #11526 Authored by: jshumphrey
2024-11-26 01:01:25 +01:00 · 2024-11-15 01:20:37 +02:00 · 2024-11-12 23:26:18 +00:00 · 2024-11-12 23:23:10 +00:00 · 2024-04-01 15:11:16 +05:30 · 2024-04-01 14:46:37 +05:30
6 changed files with 79 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -612,8 +612,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git
    --no-restrict-filenames         Allow Unicode characters, "&" and spaces in
                                    filenames (default)
    --windows-filenames             Force filenames to be Windows-compatible
-    --no-windows-filenames          Make filenames Windows-compatible only if
-                                    using Windows (default)
+    --no-windows-filenames          Sanitize filenames only minimally
    --trim-filenames LENGTH         Limit the filename length (excluding
                                    extension) to the specified number of
                                    characters
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -762,6 +762,13 @@ class TestYoutubeDL(unittest.TestCase):
        test('%(width)06d.%%(ext)s', 'NA.%(ext)s')
        test('%%(width)06d.%(ext)s', '%(width)06d.mp4')

+        # Sanitization options
+        test('%(title3)s', (None, 'foo⧸bar⧹test'))
+        test('%(title5)s', (None, 'aei_A'), restrictfilenames=True)
+        test('%(title3)s', (None, 'foo_bar_test'), windowsfilenames=False, restrictfilenames=True)
+        if sys.platform != 'win32':
+            test('%(title3)s', (None, 'foo⧸bar\\test'), windowsfilenames=False)
+
        # ID sanitization
        test('%(id)s', '_abcd', info={'id': '_abcd'})
        test('%(some_id)s', '_abcd', info={'some_id': '_abcd'})
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -267,7 +267,9 @@ class YoutubeDL:
    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
    restrictfilenames: Do not allow "&" and spaces in file names
    trim_file_name:    Limit length of filename (extension excluded)
-    windowsfilenames:  Force the filenames to be windows compatible
+    windowsfilenames:  True: Force filenames to be Windows compatible
+                       False: Sanitize filenames only minimally
+                       This option has no effect when running on Windows
    ignoreerrors:      Do not stop on download/postprocessing errors.
                       Can be 'only_download' to ignore only download errors.
                       Default is 'only_download' for CLI, but False for API
@ -1193,8 +1195,7 @@ class YoutubeDL:

    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
-        @param sanitize    Whether to sanitize the output as a filename.
-                           For backward compatibility, a function can also be passed
+        @param sanitize    Whether to sanitize the output as a filename
        """

        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set
@ -1310,14 +1311,23 @@ class YoutubeDL:

        na = self.params.get('outtmpl_na_placeholder', 'NA')

-        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
+        def filename_sanitizer(key, value, restricted):
            return sanitize_filename(str(value), restricted=restricted, is_id=(
                bool(re.search(r'(^|[_.])id(\.|$)', key))
                if 'filename-sanitization' in self.params['compat_opts']
                else NO_DEFAULT))

-        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
-        sanitize = bool(sanitize)
+        if callable(sanitize):
+            self.deprecation_warning('Passing a callable "sanitize" to YoutubeDL.prepare_outtmpl is deprecated')
+        elif not sanitize:
+            pass
+        elif (sys.platform != 'win32' and not self.params.get('restrictfilenames')
+                and self.params.get('windowsfilenames') is False):
+            def sanitize(key, value):
+                return value.replace('/', '\u29F8').replace('\0', '')
+        else:
+            def sanitize(key, value):
+                return filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames'))

        def _dumpjson_default(obj):
            if isinstance(obj, (set, LazyList)):
@ -1400,13 +1410,13 @@ class YoutubeDL:

            if sanitize:
                # If value is an object, sanitize might convert it to a string
-                # So we convert it to repr first
+                # So we manually convert it before sanitizing
                if fmt[-1] == 'r':
                    value, fmt = repr(value), str_fmt
                elif fmt[-1] == 'a':
                    value, fmt = ascii(value), str_fmt
                if fmt[-1] in 'csra':
-                    value = sanitizer(last_field, value)
+                    value = sanitize(last_field, value)

            key = '{}\0{}'.format(key.replace('%', '%\0'), outer_mobj.group('format'))
            TMPL_DICT[key] = value
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
                },
            },
        ],
+    }, {
+        # The reviewbody is None for one of the reviews; just need to extract data without crashing
+        'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+        'info_dict': {
+            'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+            'ext': 'mp3',
+            'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'creators': ['Grateful Dead'],
+            'duration': 338.31,
+            'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
+            'display_id': 'gd95-04-02d1t04.shn',
+            'location': 'Pyramid Arena',
+            'uploader': 'jon@archive.org',
+            'album': '1995-04-02 - Pyramid Arena',
+            'upload_date': '20040519',
+            'track_number': 4,
+            'release_date': '19950402',
+            'timestamp': 1084927901,
+        },
    }]

    @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
                info['comments'].append({
                    'id': review.get('review_id'),
                    'author': review.get('reviewer'),
-                    'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
+                    'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
                    'timestamp': unified_timestamp(review.get('createdate')),
                    'parent': 'root'})

--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
                return extract_video_data(try_get(
                    js_data, lambda x: x['jsmods']['instances'], list) or [])

-        def extract_dash_manifest(video, formats):
+        def extract_dash_manifest(vid_data, formats, mpd_url=None):
            dash_manifest = traverse_obj(
-                video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
+                vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
            if dash_manifest:
                formats.extend(self._parse_mpd_formats(
                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
-                    mpd_url=url_or_none(video.get('dash_manifest_url'))))
+                    mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))

        def process_formats(info):
            # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
                        video = video['creation_story']
                        video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
                        video.update(reel_info)
-                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
+
                    formats = []
                    q = qualities(['sd', 'hd'])
+
+                    # Legacy formats extraction
+                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
                    for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
                                           ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
                                           ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
                        if not playable_url:
                            continue
                        if determine_ext(playable_url) == 'mpd':
-                            formats.extend(self._extract_mpd_formats(playable_url, video_id))
+                            formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
                        else:
                            formats.append({
                                'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
                                'url': playable_url,
                            })
                    extract_dash_manifest(fmt_data, formats)
+
+                    # New videoDeliveryResponse formats extraction
+                    fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
+                    mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
+                    dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
+                    for idx, dash_manifest in enumerate(dash_manifests):
+                        extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
+                    if not dash_manifests:
+                        # Only extract from MPD URLs if the manifests are not already provided
+                        for mpd_url in mpd_urls:
+                            formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
+                    for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
+                        format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
+                        formats.append({
+                            'format_id': format_id,
+                            # sd, hd formats w/o resolution info should be deprioritized below DASH
+                            'quality': q(format_id) - 3,
+                            'url': prog_fmt['progressive_url'],
+                        })
+                    for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
+                        formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
+
                    if not formats:
                        # Do not append false positive entry w/o any formats
                        return
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -1368,12 +1368,12 @@ def create_parser():
        help='Allow Unicode characters, "&" and spaces in filenames (default)')
    filesystem.add_option(
        '--windows-filenames',
-        action='store_true', dest='windowsfilenames', default=False,
+        action='store_true', dest='windowsfilenames', default=None,
        help='Force filenames to be Windows-compatible')
    filesystem.add_option(
        '--no-windows-filenames',
        action='store_false', dest='windowsfilenames',
-        help='Make filenames Windows-compatible only if using Windows (default)')
+        help='Sanitize filenames only minimally')
    filesystem.add_option(
        '--trim-filenames', '--trim-file-names', metavar='LENGTH',
        dest='trim_file_name', default=0, type=int,
Author	SHA1	Message	Date
pukkandan	878c5f1fe7	Merge `9de472f6ea` into `f2a4983df7`	2024-11-15 01:20:37 +02:00
Jackson Humphrey	f2a4983df7	[ie/archive.org] Fix comments extraction (#11527 ) Closes #11526 Authored by: jshumphrey	2024-11-12 23:26:18 +00:00
bashonly	bacc31b05a	[ie/facebook] Fix formats extraction (#11513 ) Closes #11497 Authored by: bashonly	2024-11-12 23:23:10 +00:00
pukkandan	9de472f6ea	Add tests	2024-04-01 15:11:16 +05:30
pukkandan	0f58c5628a	Reduce filename sanitization when `--no-windows-filenames` is given	2024-04-01 14:46:37 +05:30