[XHamster] Add channel extraction

[XHamster] Add extraction of user's favorites
[XHamster] Move domain list to base class and introduce classpropinit
2024-11-12 18:21:22 +01:00 · 2023-10-04 01:56:22 +01:00 · 2023-10-04 01:56:22 +01:00 · 2023-10-04 01:56:21 +01:00 · 2023-10-04 01:56:21 +01:00 · 2023-10-04 01:07:50 +01:00
4 changed files with 225 additions and 90 deletions
--- a/test/test_download.py
+++ b/test/test_download.py
@ -123,8 +123,10 @@ def generator(test_case, tname):
        params['outtmpl'] = tname + '_' + params['outtmpl']
        if is_playlist and 'playlist' not in test_case:
            params.setdefault('extract_flat', 'in_playlist')
-            if traverse_obj(test_case, 'playlist_count', 'playlist_maxcount', default=-1) < 0:
-                params.setdefault('playlistend', test_case.get('playlist_mincount'))
+            # only process enough items for specified tests
+            pl_counts = traverse_obj(test_case, (None, ('playlist_count', 'playlist_mincount', 'playlist_maxcount')))
+            if pl_counts:
+                params.setdefault('playlistend', max(pl_counts) + 1)
            params.setdefault('skip_download', True)

        ydl = YoutubeDL(params, auto_init=False)
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1603,6 +1603,7 @@ from .xhamster import (
    XHamsterIE,
    XHamsterEmbedIE,
    XHamsterCategoryIE,
+    XHamsterChannelIE,
    XHamsterCreatorIE,
    XHamsterSearchIE,
    XHamsterSearchKeyIE,
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@ -12,6 +12,7 @@ from ..compat import (
    compat_urlparse,
 )
 from ..utils import (
+    classpropinit,
    clean_html,
    determine_ext,
    extract_attributes,
@ -22,6 +23,7 @@ from ..utils import (
    merge_dicts,
    parse_duration,
    parse_qs,
+    remove_start,
    T,
    traverse_obj,
    txt_or_none,
@ -32,6 +34,18 @@ from ..utils import (


 class XHamsterBaseIE(InfoExtractor):
+    # base domains that don't redirect to xhamster.com (not xhday\d\.com, eg)
+    _DOMAINS = '(?:%s)' % '|'.join((
+        r'xhamster\d*\.(?:com|desi)',
+        r'xhamster\.one',
+        r'xhms\.pro',
+        r'xh(?:open|access|victory|big|channel)\.com',
+        r'(?:full|mega)xh\.com',
+        r'xh(?:vid|official|planet)\d*\.com',
+        # requires Tor
+        r'xhamster[a-z2-7]+\.onion',
+    ))
+
    def _download_webpage_handle(self, url, video_id, *args, **kwargs):
        # note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None)
        # default UA to 'Mozilla' (only) to avoid interstitial page
@ -53,25 +67,16 @@ class XHamsterBaseIE(InfoExtractor):


 class XHamsterIE(XHamsterBaseIE):
-    # base domains that don't redirect to xhamster.com (not xhday\d\.com, eg)
-    _DOMAINS = '(?:%s)' % '|'.join((
-        r'xhamster\d*\.(?:com|desi)',
-        r'xhamster\.one',
-        r'xhms\.pro',
-        r'xh(?:open|access|victory|big|channel)\.com',
-        r'(?:full|mega)xh\.com',
-        r'xh(?:vid|official|planet)\d*\.com',
-        # requires Tor
-        r'xhamster[a-z2-7]+\.onion',
-    ))
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:.+?\.)?%s/
-                        (?:
-                            movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html|
-                            videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+)
-                        )
-                    ''' % _DOMAINS
+    _VALID_URL = classpropinit(
+        lambda cls:
+            r'''(?x)
+                https?://
+                    (?:.+?\.)?%s/
+                    (?:
+                        movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html|
+                        videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+)
+                    )
+            ''' % cls._DOMAINS)
    _TESTS = [{
        'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
        'md5': '34e1ab926db5dc2750fed9e1f34304bb',
@ -379,7 +384,9 @@ class XHamsterIE(XHamsterBaseIE):


 class XHamsterEmbedIE(XHamsterBaseIE):
-    _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS
+    _VALID_URL = classpropinit(
+        lambda cls:
+        r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % cls._DOMAINS)
    _TEST = {
        'url': 'http://xhamster.com/xembed.php?video=3328539',
        'info_dict': {
@ -421,9 +428,19 @@ class XHamsterEmbedIE(XHamsterBaseIE):

 class XHamsterPlaylistIE(XHamsterBaseIE):
    _NEXT_PAGE_RE = r'(<a\b[^>]+\bdata-page\s*=\s*["\']next[^>]+>)'
+    _VALID_URL_TPL = r'''(?x)
+        https?://(?:.+?\.)?%s
+                /%s/(?P<id>[^/?#]+)
+                (?:(?P<sub>(?:/%s)+))?
+                (?:/(?P<pnum>\d+))?(?:[/?#]|$)
+    '''

-    def _page_url(self, user_id, page_num, url=None):
-        return self._PAGE_URL_TPL % (user_id, page_num)
+    def _page_url(self, user_id, subs, page_num, url):
+        # Build the URL of page `page_num` of a listing: `user_id` and the
+        # `subs` path segments are joined with '/' (empty values are
+        # dropped by join_nonempty) and substituted into `_PAGE_URL_TPL`
+        n_url = self._PAGE_URL_TPL % (
+            join_nonempty(user_id, *subs, delim='/'), page_num)
+        n_url = compat_urlparse.urlsplit(n_url)
+        url = compat_urlparse.urlsplit(url)
+        # scheme, host and path come from the generated URL; the query and
+        # fragment are carried over from the original `url`
+        return compat_urlparse.urlunsplit(n_url[:3] + url[3:])

    def _extract_entries(self, page, user_id):
        for video_tag_match in re.finditer(
@ -442,9 +459,9 @@ class XHamsterPlaylistIE(XHamsterBaseIE):
            self._search_regex(self._NEXT_PAGE_RE, page, 'next page', default=None),
            (T(extract_attributes), 'href', T(url_or_none)))

-    def _entries(self, user_id, page_num=None, page=None, url=None):
+    def _entries(self, user_id, subs, page_num=None, page=None, url=None):
        page_1 = 1 if page_num is None else page_num
-        next_page_url = self._page_url(user_id, page_1, url)
+        next_page_url = self._page_url(user_id, subs, page_1, url)
        for pagenum in itertools.count(page_1):
            if not page:
                page = self._download_webpage(
@ -463,34 +480,28 @@ class XHamsterPlaylistIE(XHamsterBaseIE):
                break
            page = None

-    def _fancy_page_url(self, user_id, page_num, url):
-        sub = self._match_valid_url(url).group('sub')
-        n_url = self._PAGE_URL_TPL % (
-            join_nonempty(user_id, sub, delim='/'), page_num)
-        return compat_urlparse.urljoin(n_url, url)
-
-    def _fancy_get_title(self, user_id, page_num, url):
-        sub = self._match_valid_url(url).group('sub')
-        sub = (sub or '').split('/')
-        sub.extend((compat_urlparse.urlsplit(url).query or '').split('&'))
-        sub.append('all' if page_num is None else ('p%d' % page_num))
-        return '%s (%s)' % (user_id, join_nonempty(*sub, delim=','))
-
    @staticmethod
-    def _get_title(user_id, page_num, url=None):
-        return '%s (%s)' % (user_id, 'all' if page_num is None else ('p%d' % page_num))
+    def _get_title(user_id, subs, page_num, url):
+        # Playlist title: '<user_id> (<part>,<part>,...)' where the parts
+        # are the `subs` path segments, the '&'-separated query items of
+        # `url` (if given) and finally 'all' or 'p<page_num>'
+        # work on a copy so that the caller's list is left untouched
+        subs = subs[:]
+        if url:
+            subs.extend((compat_urlparse.urlsplit(url).query or '').split('&'))
+        subs.append('all' if page_num is None else ('p%d' % page_num))
+        # join_nonempty drops empty parts (eg, from an empty query)
+        return '%s (%s)' % (user_id, join_nonempty(*subs, delim=','))

    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        user_id = mobj.group('id')
-        page_num = int_or_none(mobj.groupdict().get('pnum'))
+        mobj = self._match_valid_url(url).groupdict()
+        user_id = mobj['id']
+        page_num = int_or_none(mobj.get('pnum'))
+        subs = remove_start(mobj.get('sub') or '', '/').split('/')
        return self.playlist_result(
-            self._entries(user_id, page_num, url=url), user_id,
-            self._get_title(user_id, page_num, url=url))
+            self._entries(user_id, subs, page_num, url=url), user_id,
+            self._get_title(user_id, subs, page_num, url=url))


 class XHamsterUserIE(XHamsterPlaylistIE):
-    _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)(?:/videos/(?P<pnum>\d+))?' % XHamsterIE._DOMAINS
+    _VALID_URL = classpropinit(
+        lambda cls:
+        r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)(?P<sub>/favorites)?(?:/videos/(?P<pnum>\d+))?' % cls._DOMAINS)
    _PAGE_URL_TPL = 'https://xhamster.com/users/%s/videos/%s'
    _TESTS = [{
        # Paginated user profile
@ -513,10 +524,27 @@ class XHamsterUserIE(XHamsterPlaylistIE):
        'url': 'https://xhamster.com/users/firatkaan/videos',
        'info_dict': {
            'id': 'firatkaan',
+            'title': 'firatkaan (all)',
        },
        'playlist_mincount': 1,
    }, {
-        # the below doesn't match but is redirected via generic
+        # User with `favorites`
+        'url': 'https://xhamster.com/users/cubafidel/videos/',
+        'info_dict': {
+            'id': 'cubafidel',
+            'title': 'cubafidel (all)',
+        },
+        'playlist_maxcount': 300,
+    }, {
+        # Faves of user with `favorites`
+        'url': 'https://xhamster.com/users/cubafidel/favorites/videos/',
+        'info_dict': {
+            'id': 'cubafidel',
+            'title': 'cubafidel (favorites,all)',
+        },
+        'playlist_mincount': 400,
+    }, {
+        # below URL doesn't match but is redirected via generic
        # 'url': 'https://xhday.com/users/mobhunter',
        'url': 'https://xhvid.com/users/pelushe21',
        'only_matching': True,
@ -525,13 +553,13 @@ class XHamsterUserIE(XHamsterPlaylistIE):

 class XHamsterCreatorIE(XHamsterPlaylistIE):
    # `pornstars`, `celebrities` and `creators` share the same namespace
-    _VALID_URL = r'''(?x)
-        https?://(?:.+?\.)?%s
-                /(?:(?:gay|shemale)/)?(?:creators|pornstars|celebrities)
-                /(?P<id>[^/?#]+)
-                (?:(?P<sub>(?:/(?:hd|4k|newest|full-length|exclusive))+))?
-                (?:/(?P<pnum>\d+))?(?:[/?#]|$)
-    ''' % XHamsterIE._DOMAINS
+    _VALID_URL = classpropinit(
+        lambda cls:
+        cls._VALID_URL_TPL % (
+            cls._DOMAINS,
+            '(?:(?:gay|shemale)/)?(?:creators|pornstars|celebrities)',
+            r'(?:hd|4k|newest|full-length|exclusive|best(?:/(?:weekly|monthly|year-\d{4}))?)',
+        ))
    _PAGE_URL_TPL = 'https://xhamster.com/creators/%s/%s'
    _TESTS = [{
        # Paginated creator profile
@ -569,24 +597,66 @@ class XHamsterCreatorIE(XHamsterPlaylistIE):
        'playlist_maxcount': 30,
    }]

-    def _page_url(self, user_id, page_num, url):
-        return self._fancy_page_url(user_id, page_num, url)

-    def _get_title(self, user_id, page_num, url):
-        return self._fancy_get_title(user_id, page_num, url)
-
-
-class XHamsterCategoryIE(XHamsterPlaylistIE):
-    # `tags` and `categories` share the same namespace
-    _VALID_URL = r'''(?x)
-        https?://(?:.+?\.)?%s
-                (?:(?P<queer>gay|shemale)/)?(?:/categories|/tags|(?=/hd))
-                /(?P<id>[^/?#]+)
-                (?P<sub>(?:/(?:hd|4k|producer|creator|best(?:/(?:weekly|monthly|year-\d{4}))?))+)?
-                (?:/(?P<pnum>\d+))?(?:[/?#]|$)
-    ''' % XHamsterIE._DOMAINS
-    _PAGE_URL_TPL = 'https://xhamster.com/categories/%s/%s'
+class XHamsterChannelBaseIE(XHamsterPlaylistIE):
+    # matches the `<a>` tag whose class list includes
+    # `prev-next-list-link--next`, ie the link to the next page
    _NEXT_PAGE_RE = r'(<a\b[^>]+\bclass\s*=\s*("|\')(?:[\w-]+\s+)*?prev-next-list-link--next(?:\s+[\w-]+)*\2[^>]+>)'
+
+
+class XHamsterChannelIE(XHamsterChannelBaseIE):
+    # Playlist extractor for `/channels/<id>` pages, optionally under a
+    # `gay/` or `shemale/` prefix and followed by filter path segments
+    # (hd, 4k, newest, full-length, best[/weekly|/monthly|/year-NNNN])
+    # and a page number, as laid out by `_VALID_URL_TPL`
+    _VALID_URL = classpropinit(
+        lambda cls:
+        cls._VALID_URL_TPL % (
+            cls._DOMAINS,
+            '(?:(?:gay|shemale)/)?channels',
+            r'(?:hd|4k|newest|full-length|best(?:/(?:weekly|monthly|year-\d{4}))?)',
+        ))
+    # filled in with the channel path and the page number
+    _PAGE_URL_TPL = 'https://xhamster.com/channels/%s/%s'
+    _TESTS = [{
+        # Paginated channel
+        'url': 'https://xhamster.com/channels/freeuse-fantasy',
+        'info_dict': {
+            'id': 'freeuse-fantasy',
+            'title': 'freeuse-fantasy (all)',
+        },
+        'playlist_mincount': 90,
+    }, {
+        # Non-paginated channel (for now?)
+        'url': 'https://xhamster.com/channels/oopsie',
+        'info_dict': {
+            'id': 'oopsie',
+            'title': 'oopsie (all)',
+        },
+        'playlist_mincount': 30,
+        'playlist_maxcount': 48,
+    }, {
+        # Channel filtered by path
+        'url': 'https://xhamster.com/channels/freeuse-fantasy/best/year-2022',
+        'info_dict': {
+            'id': 'freeuse-fantasy',
+            'title': 'freeuse-fantasy (best,year-2022,all)',
+        },
+        'playlist_count': 30,
+    }, {
+        # Channel filtered by query
+        'url': 'https://xhamster.com/channels/freeuse-fantasy?min-duration=40',
+        'info_dict': {
+            'id': 'freeuse-fantasy',
+            'title': 'freeuse-fantasy (min-duration=40,all)',
+        },
+        'playlist_maxcount': 10,
+    }]
+
+
+class XHamsterCategoryIE(XHamsterChannelBaseIE):
+    # `tags` and `categories` share the same namespace
+    _VALID_URL = classpropinit(
+        lambda cls:
+        cls._VALID_URL_TPL % (
+            cls._DOMAINS,
+            '(?:(?P<queer>gay|shemale)/)?(?:categories|tags|(?=hd))',
+            r'(?:hd|4k|producer|creator|best(?:/(?:weekly|monthly|year-\d{4}))?)',
+        ))
+    _PAGE_URL_TPL = 'https://xhamster.com/categories/%s/%s'
    _TESTS = [{
        # Paginated category/tag
        'url': 'https://xhamster.com/tags/hawaiian',
@ -624,26 +694,28 @@ class XHamsterCategoryIE(XHamsterPlaylistIE):
        'playlist_maxcount': 20,
    }]

-    def _page_url(self, user_id, page_num, url):
-        queer, sub = self._match_valid_url(url).group('queer', 'sub')
+    def _page_url(self, user_id, subs, page_num, url):
+        queer = self._match_valid_url(url).group('queer')
        n_url = self._PAGE_URL_TPL % (
-            join_nonempty(queer, user_id, sub, delim='/'), page_num)
+            join_nonempty(queer, user_id, *subs, delim='/'), page_num)
        return compat_urlparse.urljoin(n_url, url)

-    def _get_title(self, user_id, page_num, url):
-        queer, sub = self._match_valid_url(url).group('queer', 'sub')
-        queer = [] if queer is None else [queer]
-        sub = queer + (sub or '').split('/')
-        sub.extend((compat_urlparse.urlsplit(url).query or '').split('&'))
-        sub.append('all' if page_num is None else ('p%d' % page_num))
-        return '%s (%s)' % (user_id, join_nonempty(*sub, delim=','))
+    def _get_title(self, user_id, subs, page_num, url):
+        # Playlist title: '<user_id> (<part>,<part>,...)' where the parts
+        # are the optional `queer` URL prefix (gay|shemale), the `subs`
+        # path segments, the '&'-separated query items of `url` and
+        # finally 'all' or 'p<page_num>'
+        queer = self._match_valid_url(url).group('queer')
+        # always build a new list: `subs` belongs to the caller, which
+        # passes the same list to `_entries()`, so it must not be changed
+        parts = ([queer] if queer else []) + list(subs)
+        parts.extend((compat_urlparse.urlsplit(url).query or '').split('&'))
+        parts.append('all' if page_num is None else ('p%d' % page_num))
+        # join_nonempty drops empty parts (eg, from an empty query)
+        return '%s (%s)' % (user_id, join_nonempty(*parts, delim=','))


 class XHamsterSearchIE(XHamsterPlaylistIE):
-    _VALID_URL = r'''(?x)
-        https?://(?:.+?\.)?%s
-                /search/(?P<id>[^/?#]+)
-    ''' % XHamsterIE._DOMAINS
+    _VALID_URL = classpropinit(
+        lambda cls:
+        r'''(?x)
+            https?://(?:.+?\.)?%s
+                    /search/(?P<id>[^/?#]+)
+        ''' % cls._DOMAINS)
    _TESTS = [{
        # Single page result
        'url': 'https://xhamster.com/search/latvia',
@ -672,20 +744,20 @@ class XHamsterSearchIE(XHamsterPlaylistIE):
    }]

    @staticmethod
-    def _page_url(user_id, page_num, url):
+    def _page_url(user_id, subs, page_num, url):
        return url

-    def _get_title(self, user_id, page_num, url=None):
+    def _get_title(self, user_id, subs, page_num, url=None):
        return super(XHamsterSearchIE, self)._get_title(
-            user_id.replace('+', ' '), page_num, url)
+            user_id.replace('+', ' '), [], page_num, url)

    def _real_extract(self, url):
        user_id = self._match_id(url)
        page_num = traverse_obj(url, (
            T(parse_qs), 'page', -1, T(int_or_none)))
        return self.playlist_result(
-            self._entries(user_id, page_num, url=url), user_id,
-            self._get_title(user_id, page_num))
+            self._entries(user_id, None, page_num, url=url), user_id,
+            self._get_title(user_id, None, page_num))


 class XHamsterSearchKeyIE(SearchInfoExtractor, XHamsterSearchIE):
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -6510,3 +6510,63 @@ def join_nonempty(*values, **kwargs):
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(map(compat_str, filter(None, values)))
+
+
+# from yt-dlp
+class classproperty(object):
+    """property access for class methods with optional caching"""
+    # Usable as `@classproperty`, `classproperty(func)`,
+    # `classproperty(func=func)` or, to set options before the function
+    # is known, `@classproperty(cache=True)`
+    def __new__(cls, *args, **kwargs):
+        # the wrapped function is either the `func` keyword or the first
+        # positional argument
+        if 'func' in kwargs:
+            func = kwargs.pop('func')
+        elif len(args) > 0:
+            func = args[0]
+            args = args[1:]
+        else:
+            func = None
+        if not func:
+            # no function yet: return a factory that builds the descriptor
+            # with the remaining arguments once it is called with `func`
+            return functools.partial(cls, *args, **kwargs)
+        return super(classproperty, cls).__new__(cls)
+
+    def __init__(self, func, **kwargs):
+        # kw-only arg
+        cache = kwargs.get('cache', False)
+        functools.update_wrapper(self, func)
+        self.func = func
+        # values computed so far, keyed by class; None when caching is off
+        self._cache = {} if cache else None
+
+    def __get__(self, n, cls):
+        # `n` (the instance, if any) is unused: `func` is always called
+        # with the owner class
+        if self._cache is None:
+            return self.func(cls)
+        elif cls not in self._cache:
+            self._cache[cls] = self.func(cls)
+        return self._cache[cls]
+
+
+class classpropinit(classproperty):
+    """ A Python fubar: parent class vars are not in scope when the
+        `class suite` is evaluated, so disallowing `childvar = fn(parentvar)`.
+        Instead, the parent class has to be mentioned redundantly and
+        unmaintainably, since the current class isn't yet bound.
+        This decorator evaluates a class method and assigns its result
+        in place of the method.
+
+        class child(parent):
+            # before
+            childvar = fn(parent.parentvar)
+            # now
+            @classpropinit
+            def childvar(cls):
+                return fn(cls.parentvar)
+            # or
+            childvar = classpropinit(lambda cls: fn(cls.parentvar))
+
+        The result is stored on the class through which the attribute is
+        read, under the name that the descriptor is bound to, so that
+        later reads through that class get the plain value.
+    """
+
+    def __init__(self, func):
+        functools.update_wrapper(self, func)
+        # fallback only: for a lambda this is '<lambda>', not the name of
+        # the class variable that the descriptor is assigned to
+        self.name = func.__name__
+        self.func = func
+
+    def _bound_name(self, cls):
+        # Return the attribute name under which this descriptor is stored
+        # in `cls` or one of its bases, or else the function's own name
+        for klass in getattr(cls, '__mro__', (cls, )):
+            for name, value in vars(klass).items():
+                if value is self:
+                    return name
+        return self.name
+
+    def __get__(self, _, cls):
+        val = self.func(cls)
+        # replace the descriptor, as seen from `cls`, with the value
+        setattr(cls, self._bound_name(cls), val)
+        return val
Author	SHA1	Message	Date
dirkf	b2b622a9b5	[XHamster] Add channel extraction	2023-10-04 01:56:22 +01:00
dirkf	71aae1d795	[XHamster] Add extraction of user's `favorites`	2023-10-04 01:56:22 +01:00
dirkf	44a30c6d3a	[XHamster] Move domain list to base class and introduce classpropinit	2023-10-04 01:56:21 +01:00
dirkf	d0762cf36a	[utils] Add `classpropinit()` decorator for easier use of inherited class vars	2023-10-04 01:56:21 +01:00
dirkf	e6c95bd192	[utils] Add `classproperty()` decorator from yt-dlp	2023-10-04 01:07:50 +01:00
dirkf	3a31e52d27	[test] pl_counts	2023-10-04 00:59:11 +01:00