Merge 1cae3bf46d into f2a4983df7

[ie/archive.org] Fix comments extraction (#11527)
Closes #11526 Authored by: jshumphrey
2024-11-26 09:11:25 +01:00 · 2024-11-13 02:33:12 +00:00 · 2024-11-12 23:26:18 +00:00 · 2024-11-12 23:23:10 +00:00 · 2024-11-08 03:52:50 +01:00 · 2024-11-08 03:49:39 +01:00
3 changed files with 117 additions and 34 deletions
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@ -205,6 +205,26 @@ class ArchiveOrgIE(InfoExtractor):
                },
            },
        ],
+    }, {
+        # The reviewbody is None for one of the reviews; just need to extract data without crashing
+        'url': 'https://archive.org/details/gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+        'info_dict': {
+            'id': 'gd95-04-02.sbd.11622.sbeok.shnf/gd95-04-02d1t04.shn',
+            'ext': 'mp3',
+            'title': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'creators': ['Grateful Dead'],
+            'duration': 338.31,
+            'track': 'Stuck Inside of Mobile with the Memphis Blues Again',
+            'description': 'md5:764348a470b986f1217ffd38d6ac7b72',
+            'display_id': 'gd95-04-02d1t04.shn',
+            'location': 'Pyramid Arena',
+            'uploader': 'jon@archive.org',
+            'album': '1995-04-02 - Pyramid Arena',
+            'upload_date': '20040519',
+            'track_number': 4,
+            'release_date': '19950402',
+            'timestamp': 1084927901,
+        },
    }]

    @staticmethod
@ -335,7 +355,7 @@ class ArchiveOrgIE(InfoExtractor):
                info['comments'].append({
                    'id': review.get('review_id'),
                    'author': review.get('reviewer'),
-                    'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
+                    'text': join_nonempty('reviewtitle', 'reviewbody', from_dict=review, delim='\n\n'),
                    'timestamp': unified_timestamp(review.get('createdate')),
                    'parent': 'root'})

--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@ -563,13 +563,13 @@ class FacebookIE(InfoExtractor):
                return extract_video_data(try_get(
                    js_data, lambda x: x['jsmods']['instances'], list) or [])

-        def extract_dash_manifest(video, formats):
+        def extract_dash_manifest(vid_data, formats, mpd_url=None):
            dash_manifest = traverse_obj(
-                video, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', expected_type=str)
+                vid_data, 'dash_manifest', 'playlist', 'dash_manifest_xml_string', 'manifest_xml', expected_type=str)
            if dash_manifest:
                formats.extend(self._parse_mpd_formats(
                    compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),
-                    mpd_url=url_or_none(video.get('dash_manifest_url'))))
+                    mpd_url=url_or_none(video.get('dash_manifest_url')) or mpd_url))

        def process_formats(info):
            # Downloads with browser's User-Agent are rate limited. Working around
@ -619,9 +619,12 @@ class FacebookIE(InfoExtractor):
                        video = video['creation_story']
                        video['owner'] = traverse_obj(video, ('short_form_video_context', 'video_owner'))
                        video.update(reel_info)
-                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
+
                    formats = []
                    q = qualities(['sd', 'hd'])
+
+                    # Legacy formats extraction
+                    fmt_data = traverse_obj(video, ('videoDeliveryLegacyFields', {dict})) or video
                    for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'),
                                           ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'),
                                           ('browser_native_sd_url', 'sd')):
@ -629,7 +632,7 @@ class FacebookIE(InfoExtractor):
                        if not playable_url:
                            continue
                        if determine_ext(playable_url) == 'mpd':
-                            formats.extend(self._extract_mpd_formats(playable_url, video_id))
+                            formats.extend(self._extract_mpd_formats(playable_url, video_id, fatal=False))
                        else:
                            formats.append({
                                'format_id': format_id,
@ -638,6 +641,28 @@ class FacebookIE(InfoExtractor):
                                'url': playable_url,
                            })
                    extract_dash_manifest(fmt_data, formats)
+
+                    # New videoDeliveryResponse formats extraction
+                    fmt_data = traverse_obj(video, ('videoDeliveryResponseFragment', 'videoDeliveryResponseResult'))
+                    mpd_urls = traverse_obj(fmt_data, ('dash_manifest_urls', ..., 'manifest_url', {url_or_none}))
+                    dash_manifests = traverse_obj(fmt_data, ('dash_manifests', lambda _, v: v['manifest_xml']))
+                    for idx, dash_manifest in enumerate(dash_manifests):
+                        extract_dash_manifest(dash_manifest, formats, mpd_url=traverse_obj(mpd_urls, idx))
+                    if not dash_manifests:
+                        # Only extract from MPD URLs if the manifests are not already provided
+                        for mpd_url in mpd_urls:
+                            formats.extend(self._extract_mpd_formats(mpd_url, video_id, fatal=False))
+                    for prog_fmt in traverse_obj(fmt_data, ('progressive_urls', lambda _, v: v['progressive_url'])):
+                        format_id = traverse_obj(prog_fmt, ('metadata', 'quality', {str.lower}))
+                        formats.append({
+                            'format_id': format_id,
+                            # sd, hd formats w/o resolution info should be deprioritized below DASH
+                            'quality': q(format_id) - 3,
+                            'url': prog_fmt['progressive_url'],
+                        })
+                    for m3u8_url in traverse_obj(fmt_data, ('hls_playlist_urls', ..., 'hls_playlist_url', {url_or_none})):
+                        formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', fatal=False, m3u8_id='hls'))
+
                    if not formats:
                        # Do not append false positive entry w/o any formats
                        return
--- a/yt_dlp/postprocessor/ffmpeg.py
+++ b/yt_dlp/postprocessor/ffmpeg.py
@ -220,9 +220,20 @@ class FFmpegPostProcessor(PostProcessor):
    @staticmethod
    def stream_copy_opts(copy=True, *, ext=None):
        yield from ('-map', '0')
+
+        if ext in ('mkv', 'mka'):
+            # Some streams, such as JSON attachments, are considered of unknown
+            # type by FFmpeg but we still want to copy them.
+            yield '-copy_unknown'
+        else:
+            # Most containers don't really like unknown streams. Let's make
+            # sure to get rid of them.
+            yield '-ignore_unknown'
+
        # Don't copy Apple TV chapters track, bin_data
        # See https://github.com/yt-dlp/yt-dlp/issues/2, #19042, #19024, https://trac.ffmpeg.org/ticket/6016
-        yield from ('-dn', '-ignore_unknown')
+        yield '-dn'
+
        if copy:
            yield from ('-c', 'copy')
        if ext in ('mp4', 'mov', 'm4a'):
@ -557,7 +568,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):

    @staticmethod
    def _options(target_ext):
-        yield from FFmpegPostProcessor.stream_copy_opts(False)
+        yield from FFmpegPostProcessor.stream_copy_opts(False, ext=target_ext)
        if target_ext == 'avi':
            yield from ('-c:v', 'libxvid', '-vtag', 'XVID')

@ -583,7 +594,7 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP):

    @staticmethod
    def _options(target_ext):
-        return FFmpegPostProcessor.stream_copy_opts()
+        return FFmpegPostProcessor.stream_copy_opts(ext=target_ext)


 class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@ -620,13 +631,18 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
        webm_vtt_warn = False
        mp4_ass_warn = False

+        json_subs = {}
+
        for lang, sub_info in subtitles.items():
            if not os.path.exists(sub_info.get('filepath', '')):
                self.report_warning(f'Skipping embedding {lang} subtitle because the file is missing')
                continue
            sub_ext = sub_info['ext']
            if sub_ext == 'json':
-                self.report_warning('JSON subtitles cannot be embedded')
+                if info['ext'] in ('mkv', 'mka'):
+                    json_subs[lang] = sub_info['filepath']
+                else:
+                    self.report_warning('JSON subtitles can only be embedded in mkv/mka files.')
            elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
                sub_langs.append(lang)
                sub_names.append(sub_info.get('name'))
@ -639,31 +655,48 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
                mp4_ass_warn = True
                self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues')

-        if not sub_langs:
+        if not sub_langs and not json_subs:
            return [], info

        input_files = [filename, *sub_filenames]

-        opts = [
-            *self.stream_copy_opts(ext=info['ext']),
-            # Don't copy the existing subtitles, we may be running the
-            # postprocessor a second time
+        opts = [*self.stream_copy_opts(ext=info['ext'])]
+
+        if sub_langs and sub_names:
+            # We have regular subtitles available to embed. Don't copy the
+            # existing subtitles, we may be running the postprocessor a second
+            # time.
+            opts.extend([
                '-map', '-0:s',
-        ]
+            ])
+
        for i, (lang, name) in enumerate(zip(sub_langs, sub_names)):
-            opts.extend(['-map', f'{i + 1}:0'])
            lang_code = ISO639Utils.short2long(lang) or lang
-            opts.extend([f'-metadata:s:s:{i}', f'language={lang_code}'])
+            opts.extend([
+                '-map', f'{i + 1}:0',
+                f'-metadata:s:s:{i}', f'language={lang_code}',
+            ])
+
            if name:
                opts.extend([f'-metadata:s:s:{i}', f'handler_name={name}',
                             f'-metadata:s:s:{i}', f'title={name}'])

+        for json_lang, json_filename in json_subs.items():
+            escaped_json_filename = self._ffmpeg_filename_argument(json_filename)
+            json_basename = os.path.basename(json_filename)
+            opts.extend([
+                '-map', f'-0:m:filename:{json_lang}.json?',
+                '-attach', escaped_json_filename,
+                f'-metadata:s:m:filename:{json_basename}', 'mimetype=application/json',
+                f'-metadata:s:m:filename:{json_basename}', f'filename={json_lang}.json',
+            ])
+
        temp_filename = prepend_extension(filename, 'temp')
        self.to_screen(f'Embedding subtitles in "{filename}"')
        self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
        os.replace(temp_filename, filename)

-        files_to_delete = [] if self._already_have_subtitle else sub_filenames
+        files_to_delete = [] if self._already_have_subtitle else [*sub_filenames, *json_subs.values()]
        return files_to_delete, info


@ -678,7 +711,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
    @staticmethod
    def _options(target_ext):
        audio_only = target_ext == 'm4a'
-        yield from FFmpegPostProcessor.stream_copy_opts(not audio_only)
+        yield from FFmpegPostProcessor.stream_copy_opts(not audio_only, ext=target_ext)
        if audio_only:
            yield from ('-vn', '-acodec', 'copy')

@ -806,15 +839,20 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
            write_json_file(self._downloader.sanitize_info(info, self.get_param('clean_infojson', True)), infofn)
            info['infojson_filename'] = infofn

-        old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json')
-        if old_stream is not None:
-            yield ('-map', f'-0:{old_stream}')
-            new_stream -= 1
+        escaped_name = self._ffmpeg_filename_argument(infofn)
+        info_basename = os.path.basename(infofn)

        yield (
-            '-attach', self._ffmpeg_filename_argument(infofn),
-            f'-metadata:s:{new_stream}', 'mimetype=application/json',
-            f'-metadata:s:{new_stream}', 'filename=info.json',
+            # In order to override any old info.json reliably we need to
+            # instruct FFmpeg to consider valid tracks without a codec id, like
+            # JSON attachments.
+            '-copy_unknown',
+            # This map operation allows us to actually replace any previous
+            # info.json data.
+            '-map', '-0:m:filename:info.json?',
+            '-attach', escaped_name,
+            f'-metadata:s:m:filename:{info_basename}', 'mimetype=application/json',
+            f'-metadata:s:m:filename:{info_basename}', 'filename=info.json',
        )


@ -873,7 +911,7 @@ class FFmpegFixupStretchedPP(FFmpegFixupPostProcessor):
        stretched_ratio = info.get('stretched_ratio')
        if stretched_ratio not in (None, 1):
            self._fixup('Fixing aspect ratio', info['filepath'], [
-                *self.stream_copy_opts(), '-aspect', f'{stretched_ratio:f}'])
+                *self.stream_copy_opts(ext=info['ext']), '-aspect', f'{stretched_ratio:f}'])
        return [], info


@ -881,7 +919,7 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor):
    @PostProcessor._restrict_to(images=False, video=False)
    def run(self, info):
        if info.get('container') == 'm4a_dash':
-            self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(), '-f', 'mp4'])
+            self._fixup('Correcting container', info['filepath'], [*self.stream_copy_opts(ext=info['ext']), '-f', 'mp4'])
        return [], info


@ -904,7 +942,7 @@ class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor):
            if self.get_audio_codec(info['filepath']) == 'aac':
                args.extend(['-bsf:a', 'aac_adtstoasc'])
            self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [
-                *self.stream_copy_opts(), *args])
+                *self.stream_copy_opts(ext=info['ext']), *args])
        return [], info


@ -925,7 +963,7 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor):
            opts = ['-vf', 'setpts=PTS-STARTPTS']
        else:
            opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS']
-        self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False), '-ss', self.trim])
+        self._fixup('Fixing frame timestamp', info['filepath'], [*opts, *self.stream_copy_opts(False, ext=info['ext']), '-ss', self.trim])
        return [], info


@ -934,7 +972,7 @@ class FFmpegCopyStreamPP(FFmpegFixupPostProcessor):

    @PostProcessor._restrict_to(images=False)
    def run(self, info):
-        self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts())
+        self._fixup(self.MESSAGE, info['filepath'], self.stream_copy_opts(ext=info['ext']))
        return [], info


@ -1063,7 +1101,7 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor):
        self.to_screen(f'Splitting video by chapters; {len(chapters)} chapters found')
        for idx, chapter in enumerate(chapters):
            destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info)
-            self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())])
+            self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts(ext=info['ext']))])
        if in_file != info['filepath']:
            self._delete_downloaded_files(in_file, msg=None)
        return [], info
Author	SHA1	Message	Date
Riteo	1b39c4c782	Merge `1cae3bf46d` into `f2a4983df7`	2024-11-13 02:33:12 +00:00
Jackson Humphrey	f2a4983df7	[ie/archive.org] Fix comments extraction (#11527). Closes #11526. Authored by: jshumphrey	2024-11-12 23:26:18 +00:00
bashonly	bacc31b05a	[ie/facebook] Fix formats extraction (#11513). Closes #11497. Authored by: bashonly	2024-11-12 23:23:10 +00:00
Riteo	1cae3bf46d	Use unpack operator for files to delete	2024-11-08 03:52:50 +01:00
Riteo	4aa3c401d4	Do not pass `-map -0:s` multiple times	2024-11-08 03:49:39 +01:00
Riteo	0cc0f3f086	Merge remote-tracking branch 'origin/master' into json-subtitles	2024-11-08 03:44:09 +01:00
Riteo	85a844aef3	Select copy mode depending on extension	2024-09-11 11:43:33 +02:00
Riteo	17781f9d7d	Remove debug thing I'm dumb	2024-09-08 13:33:24 +02:00
Riteo	fc349670c3	Fix info attachment in subpaths	2024-09-08 13:30:35 +02:00
Riteo	4b5be635b1	Add missing comma (again) oops	2024-09-08 13:30:35 +02:00
Riteo	45d1f2bb6c	Fix attachments in subpaths	2024-09-08 13:30:32 +02:00
Riteo	7fb0c05ff6	Revert format check stuff	2024-09-08 13:04:59 +02:00
Riteo	aaa25eb508	Add missing trailing comma	2024-08-14 03:18:55 +02:00
Riteo	780bfd044f	Pass target extension to all stream_copy_opts instances	2024-08-14 03:05:11 +02:00
Riteo	fe5de0005e	Add extra checks for non-matroska formats when copying	2024-08-14 02:55:33 +02:00
Riteo	9db000a9af	Check also if there are json subtitles	2024-08-14 02:55:29 +02:00
Riteo	62e274f515	Move regular subtitles options to their loop	2024-08-14 02:10:14 +02:00
Riteo	e202aae5d6	Remove redundant copy_unknown	2024-08-14 02:03:09 +02:00
Riteo	3b8050da5b	Merge remote-tracking branch 'origin/master' into json-subtitles	2024-08-14 02:02:56 +02:00
Riteo	38a9f70044	Use a map for JSON sub handling instead of two lists	2024-08-14 01:16:15 +02:00
Riteo	550b3a046a	Use the -copy_unknown flag in the stream copy options. Also split the yield expression as the comment above was a bit misleading (it was only related to the `-dn` flag).	2024-08-13 22:30:08 +02:00
Riteo	ba3a7232f0	[pp/FFmpegEmbedSubtitle] Embed JSON subtitles as Matroska attachments Since we can't embed them as regular subtitles (due to them not having any consistent structure), we embed them as file attachments, if exporting as Matroska. This allows us to have single-file downloads with everything embedded for e.g. archival purposes.	2024-06-14 16:56:54 +02:00
Riteo	339828d777	[pp/FFmpegMetadata] Use metadata stream specifier for info.json The old stream index specifiers would indiscriminately select any JSON attachment, which made stuff like embedding live chat json data risky if not impossible. Also adds `-copy_unknown` as JSON data is "unknown" according to FFmpeg (since it has no codec id) and thus would otherwise be rejected by default.	2024-06-14 16:56:52 +02:00