[ie/xhamster:user] Support creator urls (#8232 )

Authored by: Grub4K
[ie/youtube] Raise a warning for Incomplete Data instead of an error (#8238 )
2024-11-12 18:31:25 +01:00 · 2023-10-03 11:33:40 +02:00 · 2023-10-03 06:42:30 +00:00
3 changed files with 32 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -1809,6 +1809,7 @@ The following extractors use this feature:
 * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8)
 * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others
 * `innertube_key`: Innertube API key to use for all API requests
+* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning

 #### youtubetab (YouTube playlists, channels, feeds, etc.)
 * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)
--- a/yt_dlp/extractor/xhamster.py
+++ b/yt_dlp/extractor/xhamster.py
@ -407,7 +407,7 @@ class XHamsterEmbedIE(InfoExtractor):


 class XHamsterUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)' % XHamsterIE._DOMAINS
+    _VALID_URL = rf'https?://(?:[^/?#]+\.)?{XHamsterIE._DOMAINS}/(?:(?P<user>users)|creators)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        # Paginated user profile
        'url': 'https://xhamster.com/users/netvideogirls/videos',
@ -422,6 +422,12 @@ class XHamsterUserIE(InfoExtractor):
            'id': 'firatkaan',
        },
        'playlist_mincount': 1,
+    }, {
+        'url': 'https://xhamster.com/creators/squirt-orgasm-69',
+        'info_dict': {
+            'id': 'squirt-orgasm-69',
+        },
+        'playlist_mincount': 150,
    }, {
        'url': 'https://xhday.com/users/mobhunter',
        'only_matching': True,
@ -430,8 +436,9 @@ class XHamsterUserIE(InfoExtractor):
        'only_matching': True,
    }]

-    def _entries(self, user_id):
-        next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id
+    def _entries(self, user_id, is_user):
+        prefix, suffix = ('users', 'videos') if is_user else ('creators', 'exclusive')
+        next_page_url = f'https://xhamster.com/{prefix}/{user_id}/{suffix}/1'
        for pagenum in itertools.count(1):
            page = self._download_webpage(
                next_page_url, user_id, 'Downloading page %s' % pagenum)
@ -454,5 +461,5 @@ class XHamsterUserIE(InfoExtractor):
                break

    def _real_extract(self, url):
-        user_id = self._match_id(url)
-        return self.playlist_result(self._entries(user_id), user_id)
+        user, user_id = self._match_valid_url(url).group('user', 'id')
+        return self.playlist_result(self._entries(user_id, bool(user)), user_id)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@ -941,7 +941,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
    def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                          ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                          default_client='web'):
-        for retry in self.RetryManager():
+        raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=YoutubeIE))
+        # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal.
+        icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete))
+        icd_rm = next(icd_retries)
+        main_retries = iter(self.RetryManager())
+        main_rm = next(main_retries)
+        for _ in range(main_rm.retries + icd_rm.retries + 1):
            try:
                response = self._call_api(
                    ep=ep, fatal=True, headers=headers,
@ -953,7 +959,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                if not isinstance(e.cause, network_exceptions):
                    return self._error_or_warning(e, fatal=fatal)
                elif not isinstance(e.cause, HTTPError):
-                    retry.error = e
+                    main_rm.error = e
+                    next(main_retries)
                    continue

                first_bytes = e.cause.response.read(512)
@ -965,27 +972,32 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                    if yt_error:
                        self._report_alerts([('ERROR', yt_error)], fatal=False)
                # Downloading page may result in intermittent 5xx HTTP error
-                # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
+                # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                # We also want to catch all other network exceptions since errors in later pages can be troublesome
                # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
                if e.cause.status not in (403, 429):
-                    retry.error = e
+                    main_rm.error = e
+                    next(main_retries)
                    continue
                return self._error_or_warning(e, fatal=fatal)

            try:
                self._extract_and_report_alerts(response, only_once=True)
            except ExtractorError as e:
-                # YouTube servers may return errors we want to retry on in a 200 OK response
+                # YouTube's servers may return errors we want to retry on in a 200 OK response
                # See: https://github.com/yt-dlp/yt-dlp/issues/839
                if 'unknown error' in e.msg.lower():
-                    retry.error = e
+                    main_rm.error = e
+                    next(main_retries)
                    continue
                return self._error_or_warning(e, fatal=fatal)
            # Youtube sometimes sends incomplete data
            # See: https://github.com/ytdl-org/youtube-dl/issues/28194
            if not traverse_obj(response, *variadic(check_get_keys)):
-                retry.error = ExtractorError('Incomplete data received', expected=True)
+                icd_rm.error = ExtractorError('Incomplete data received', expected=True)
+                should_retry = next(icd_retries, None)
+                if not should_retry:
+                    return None
                continue

            return response
Author	SHA1	Message	Date
Simon Sawicki	cc8d844152	[ie/xhamster:user] Support creator urls (#8232 ) Authored by: Grub4K	2023-10-03 11:33:40 +02:00
coletdjnz	eb5bdbfa70	[ie/youtube] Raise a warning for `Incomplete Data` instead of an error (#8238 ) Closes https://github.com/yt-dlp/yt-dlp/issues/8206 Adds `raise_incomplete_data` extractor arg to revert this behaviour and raise an error. Authored by: coletdjnz Co-authored-by: Simon Sawicki <contact@grub4k.xyz>	2023-10-03 06:42:30 +00:00