Cleanup imports

Extract proper id for pages
Don't abort on no login
2024-11-29 18:51:24 +01:00 · 2024-01-18 06:31:43 +01:00 · 2024-01-18 06:31:19 +01:00 · 2024-01-18 06:30:51 +01:00 · 2024-01-18 06:29:50 +01:00 · 2024-01-18 06:28:58 +01:00
3 changed files with 157 additions and 66 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -42,7 +42,6 @@ from .abematv import (
    AbemaTVTitleIE,
 )
 from .academicearth import AcademicEarthCourseIE
 from .academymel import AcademyMelIE
 from .acast import (
    ACastIE,
    ACastChannelIE,
@ -681,7 +680,10 @@ from .genius import (
    GeniusIE,
    GeniusLyricsIE,
 )
-from .getcourseru import GetCourseRuIE
+from .getcourseru import (
    GetCourseRuPlayerIE,
    GetCourseRuIE
 )
 from .gettr import (
    GettrIE,
    GettrStreamingIE,
--- a/yt_dlp/extractor/academymel.py
+++ b/yt_dlp/extractor/academymel.py
@ -1,57 +0,0 @@
 import re
 import time
 from .common import InfoExtractor
 from .getcourseru import GetCourseRuIE
 from ..utils import update_url_query, urlencode_postdata
 class AcademyMelIE(InfoExtractor):
    _NETRC_MACHINE = 'academymel'
    _VALID_URL = r'https?://academymel\.online/(?P<id>[^/?#]+)'
    _LOGIN_URL = 'https://academymel.online/cms/system/login'
    _TESTS = [{
        'url': 'http://academymel.online/3video_1',
        'info_dict': {
            'id': '3video_1',
            'title': 'Промоуроки Академии МЕЛ',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '4885302',
                'ext': 'mp4',
                'title': 'Промоуроки Академии МЕЛ',
                'duration': 1693
            },
        }]
    }]
    def _perform_login(self, username, password):
        self._request_webpage(
            self._LOGIN_URL, None, 'Logging in', 'Failed to log in',
            data=urlencode_postdata({
                'action': 'processXdget',
                'xdgetId': 'r6335_1_1',
                'params[action]': 'login',
                'params[url]': update_url_query(self._LOGIN_URL, {'required': 'true'}),
                'params[object_type]': 'cms_page',
                'params[object_id]': -1,
                'params[email]': username,
                'params[password]': password,
                'requestTime': int(time.time())
            }))
    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        if not self._get_cookies(self._LOGIN_URL).get('PHPSESSID5'):
            self.raise_login_required()
        webpage = self._download_webpage(url, playlist_id)
        title = self._html_extract_title(webpage)
        return self.playlist_from_matches(
            re.findall(r'data-iframe-src="(https?://[^."]+\.getcourse\.ru/sign-player/[^"]+)', webpage),
            playlist_id, title, ie=GetCourseRuIE, video_kwargs={
                'url_transparent': True,
                'title': title,
            })
--- a/yt_dlp/extractor/getcourseru.py
+++ b/yt_dlp/extractor/getcourseru.py
@ -1,35 +1,181 @@
 import re
 import time
 import urllib.parse
 from .common import InfoExtractor
-from ..utils import int_or_none, traverse_obj, url_or_none
+from ..utils import int_or_none, url_or_none, urlencode_postdata
 from ..utils.traversal import traverse_obj
-class GetCourseRuIE(InfoExtractor):
+class GetCourseRuPlayerIE(InfoExtractor):
-    _VALID_URL = r'https?://[^.]+\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=(?P<id>[^#&]+)'
+    _VALID_URL = r'https?://player02\.getcourse\.ru/sign-player/?\?(?:[^#]+&)?json=[^#&]+'
    _TESTS = [{
-        'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1MjcwMzU0LCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=031d44cc738c58863a436d98f1032132&vh-static-feature=zigzag',
+        'url': 'http://player02.getcourse.ru/sign-player/?json=eyJ2aWRlb19oYXNoIjoiMTkwYmRmOTNmMWIyOTczNTMwOTg1M2E3YTE5ZTI0YjMiLCJ1c2VyX2lkIjozNTk1MjUxODMsInN1Yl9sb2dpbl91c2VyX2lkIjpudWxsLCJsZXNzb25faWQiOm51bGwsImlwIjoiNDYuMTQyLjE4Mi4yNDciLCJnY19ob3N0IjoiYWNhZGVteW1lbC5vbmxpbmUiLCJ0aW1lIjoxNzA1NDQ5NjQyLCJwYXlsb2FkIjoidV8zNTk1MjUxODMiLCJ1aV9sYW5ndWFnZSI6InJ1IiwiaXNfaGF2ZV9jdXN0b21fc3R5bGUiOnRydWV9&s=354ad2c993d95d5ac629e3133d6cefea&vh-static-feature=zigzag',
        'info_dict': {
-            'id': '4885302',
+            'id': '513573381',
            'title': '190bdf93f1b29735309853a7a19e24b3',
            'ext': 'mp4',
            'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
            'duration': 1693
        },
        'skip': 'JWT expired',
    }]
    _EMBED_REGEX = [rf'(?x)<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL}[^\'"]*)']
    def _real_extract(self, url):
        # Extract a single video from a signed player page. The page embeds a
        # `window.configs` JSON object that carries the m3u8 master playlist
        # URL and the metadata read below.
        # No video id is known before the page is parsed, hence the None ids.
        webpage = self._download_webpage(url, None, 'Downloading player page')
        window_configs = self._search_json(
            r'window\.configs\s*=', webpage, 'config', None)
        # The id is read directly from the config (plain subscript, so a
        # missing key is an error) and stringified before use.
-        video_id = str(window_configs['videoId'])
+        video_id = str(window_configs['gcFileId'])
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            window_configs['masterPlaylistUrl'], video_id)
        return {
            # Metadata looked up via traverse_obj; each value is passed through
            # the listed type/validator (presumably dropped when absent or
            # invalid - confirm against traverse_obj semantics).
            **traverse_obj(window_configs, {
                'title': ('videoHash', {str}),
-                'thumbnail': ('thumbnailUrl', {url_or_none}),
+                'thumbnail': ('previewUrl', {url_or_none}),
                'duration': ('videoDuration', {int_or_none}),
            }),
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles
        }
 class GetCourseRuIE(InfoExtractor):
    # Machine name for stored credentials (presumably the netrc entry, per
    # yt-dlp convention - confirm against InfoExtractor).
    _NETRC_MACHINE = 'getcourseru'
    # Path of the login page; it is appended to the host of the requested page
    # when logging in and is also used to recognise a redirect to the login page.
    _LOGIN_URL_PATH = '/cms/system/login'
    _TESTS = [{
        # Page served from a custom domain (listed in _DOMAINS)
        'url': 'http://academymel.online/3video_1',
        'info_dict': {
            'id': '3059742',
            'display_id': '3video_1',
            'title': 'Промоуроки Академии МЕЛ',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '513573381',
                'ext': 'mp4',
                'title': 'Промоуроки Академии МЕЛ',
                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1702370546&host=vh-80',
                'duration': 1693
            },
        }]
    }, {
        # Same page path on a getcourse.ru subdomain; the same ids are expected
        'url': 'https://academymel.getcourse.ru/3video_1',
        'info_dict': {
            'id': '3059742',
            'display_id': '3video_1',
            'title': 'Промоуроки Академии МЕЛ',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '513573381',
                'ext': 'mp4',
                'title': 'Промоуроки Академии МЕЛ',
                'thumbnail': 'https://preview-htz.kinescopecdn.net/preview/190bdf93f1b29735309853a7a19e24b3/preview.jpg?version=1703223532&host=vh-80',
                'duration': 1693
            },
        }]
    }, {
        # Lesson-view URL: the id comes from the `id` query parameter
        'url': 'https://academymel.getcourse.ru/pl/teach/control/lesson/view?id=319141781&editMode=0',
        'info_dict': {
            'id': '319141781',
            'title': '1. Разминка у стены',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '4919601',
                'ext': 'mp4',
                'title': '1. Разминка у стены',
                'thumbnail': 'https://preview-htz.vhcdn.com/preview/5a521788e7dc25b4f70c3dff6512d90e/preview.jpg?version=1703223532&host=vh-81',
                'duration': 704
            },
        }],
        'skip': 'paid lesson'
    }, {
        # Lesson-view URL on a different school subdomain
        'url': 'https://manibeauty.getcourse.ru/pl/teach/control/lesson/view?id=272499894',
        'info_dict': {
            'id': '272499894',
            'title': 'Мотивация к тренировкам',
        },
        'playlist_count': 1,
        'playlist': [{
            'info_dict': {
                'id': '4242723',
                'ext': 'mp4',
                'title': 'Мотивация к тренировкам',
                'thumbnail': 'https://preview-htz.vhcdn.com/preview/70ed5b9f489dd03b4aff55bfdff71a26/preview.jpg?version=1685115787&host=vh-71',
                'duration': 30
            },
        }],
        'skip': 'paid lesson'
    }, {
        # getcourse.io host; URL matching only
        'url': 'https://gaismasmandalas.getcourse.io/ATLAUTSEVBUT',
        'only_matching': True,
    }]
    _DOMAINS = [
        'academymel.online',
        'marafon.mani-beauty.com',
    ]
    _BASE_URL_RE = rf'https?://(?:(?!player02\.)[^.]+\.getcourse\.(?:ru|io)|{"|".join(map(re.escape, _DOMAINS))})'
    _VALID_URL = [
        rf'{_BASE_URL_RE}/(?!pl/|teach/)(?P<id>[^?#]+)',
        rf'{_BASE_URL_RE}/(:?pl/)?teach/control/lesson/view\?(?:[^#]+&)?id=(?P<id>\d+)',
    ]
    def _login(self, url, username, password):
        """Log in on the host serving *url* unless a session cookie already exists.

        The login page is downloaded first to read two values that are sent
        back with the credentials: the login form's ``data-xdget-id`` and the
        ``window.requestSimpleSign`` value.

        :param url: URL of the page being extracted; its network location
            selects the login host and it is used for the cookie lookup
        :param username: sent as ``params[email]``
        :param password: sent as ``params[password]``
        """
        # A PHPSESSID5 cookie for this URL is taken to mean "already logged in"
        if self._get_cookies(url).get('PHPSESSID5'):
            return
        domain = urllib.parse.urlparse(url).netloc
        login_url = f'https://{domain}{self._LOGIN_URL_PATH}'
        webpage = self._download_webpage(login_url, None)
        xdget_id = self._html_search_regex(
            r'<form[^>]*class="[^"]*state-login[^"]*"[^>]*data-xdget-id="([^"]+)"',
            webpage, 'xdgetId')
        # The dot is escaped so only the literal `window.requestSimpleSign`
        # assignment matches, consistent with the other `window\.` patterns
        # used in this file.
        simple_sign = self._html_search_regex(
            r'window\.requestSimpleSign\s*=\s*"([\da-f]+)"',
            webpage, 'simple sign')
        self._request_webpage(
            login_url, None, 'Logging in', 'Failed to log in',
            data=urlencode_postdata({
                'action': 'processXdget',
                'xdgetId': xdget_id,
                'params[action]': 'login',
                'params[url]': login_url,
                'params[object_type]': 'cms_page',
                'params[object_id]': -1,
                'params[email]': username,
                'params[password]': password,
                'requestTime': int(time.time()),
                'requestSimpleSign': simple_sign,
            }))
    def _real_extract(self, url):
        """Return a playlist of the player iframes embedded in a page.

        Logs in first when credentials are available. Login is reported as
        required when the request ends up on the login page or returns 404.
        The playlist id is the numeric id found in the page, falling back to
        the id matched from the URL.
        """
        login, secret = self._get_login_info()
        if login:
            self._login(url, login, secret)

        page_slug = self._match_id(url)
        # NB: 404 is returned due to yt-dlp not properly following redirects #9020
        html, handle = self._download_webpage_handle(url, page_slug, expected_status=404)
        needs_login = self._LOGIN_URL_PATH in handle.url or handle.status == 404
        if needs_login:
            self.raise_login_required()

        numeric_id = self._search_regex(
            r'window\.(?:lessonId|gcsObjectId)\s*=\s*(\d+)', html, 'playlist id', default=page_slug)
        # The <title> tag is preferred; the OpenGraph title is only the fallback
        page_title = self._html_extract_title(html) or self._og_search_title(html)

        player_urls = re.findall(GetCourseRuPlayerIE._EMBED_REGEX[0], html)
        entry_overrides = {
            'url_transparent': True,
            'title': page_title,
        }
        return self.playlist_from_matches(
            player_urls, numeric_id, page_title,
            display_id=page_slug, ie=GetCourseRuPlayerIE, video_kwargs=entry_overrides)
Author	SHA1	Message	Date
sepro	c765ee8f48	Cleanup imports	2024-01-18 06:31:43 +01:00
sepro	0cf1aa652d	Extract proper id for pages	2024-01-18 06:31:19 +01:00
sepro	17e21cee51	Don't abort on no login — some pages exist that can be accessed without logging in	2024-01-18 06:30:51 +01:00
sepro	273782d14d	Add simple sign to login — the site sends this. Seems like a CSRF value. Even though it is unchecked, better to send it imo.	2024-01-18 06:29:50 +01:00
sepro	a2d1d06793	Refactor login code	2024-01-18 06:28:58 +01:00
sepro	799f5a6737	Only login if username/password is passed	2024-01-18 06:27:38 +01:00
sepro	d60d40abdd	Fix regex	2024-01-18 06:27:03 +01:00
sepro	7bbcc4e89d	Extract og title as fallback. Web interface allows both to be set independently	2024-01-18 06:26:13 +01:00
sepro	9ac446d9ae	Use embed regex for consistency to find iframes	2024-01-18 06:25:48 +01:00
sepro	db82da757c	Add marafon.mani-beauty.com domain	2024-01-18 06:25:13 +01:00
sepro	d7b18d9b55	Add getcourse.io test	2024-01-18 06:25:02 +01:00
sepro	d7fa3c48a3	Use gcFileId as this is also used in webpage	2024-01-18 06:24:46 +01:00
sepro	b5748f60e7	Add Embed regex	2024-01-18 06:24:12 +01:00
sepro	36c6dd1c7a	Only match exactly player02 and add getcourse.io	2024-01-18 06:24:01 +01:00
Dr. Steven Strange	7149abed68	[GetCourseRuPlayerIE]&[GetCourseRuIE] added another test-case, that was successfully tested (with proper credentials)	2024-01-18 02:12:36 +01:00
Dr. Steven Strange	b0302b510c	[GetCourseRuPlayerIE]&[GetCourseRuIE] xdgetId is now parsed from login page	2024-01-18 02:03:40 +01:00
Dr. Steven Strange	24bb5404b8	[GetCourseRuPlayerIE]&[GetCourseRuIE] fixing flake8 remarks, login-related TODO	2024-01-18 01:38:42 +01:00
Dr. Steven Strange	2323a99538	[GetCourseRuPlayerIE]&[GetCourseRuIE] fixing URLs and playlist_id	2024-01-18 01:29:37 +01:00
Dr. Steven Strange	7759ab6205	[GetCourseRuPlayerIE]&[GetCourseRuIE] adding more generic getcourse.ru extractors	2024-01-17 01:41:29 +01:00