Compare commits

...

7 Commits

Author SHA1 Message Date
Kieran
107b08795e
Merge 19fa262ecb into da252d9d32 2024-11-18 02:47:53 +02:00
bashonly
19fa262ecb
ruff fixes
Authored by: bashonly
2024-11-15 20:21:08 -06:00
bashonly
709fa17030
Merge branch 'yt-dlp:master' into pr/9894 2024-11-15 20:20:31 -06:00
Kieran Eglin
311dc3d438
Addl. regex refactoring 2024-05-09 10:57:13 -07:00
Kieran Eglin
620d721779
Refactored based on feedback 2024-05-09 10:55:22 -07:00
Kieran Eglin
6d23661542
Updated playlist regex 2024-05-09 10:37:51 -07:00
Kieran Eglin
d816fb28dc
Added 30 day singer extractor 2024-05-09 09:39:23 -07:00
2 changed files with 105 additions and 0 deletions

View File

@ -2082,6 +2082,7 @@ from .theplatform import (
from .thestar import TheStarIE from .thestar import TheStarIE
from .thesun import TheSunIE from .thesun import TheSunIE
from .theweatherchannel import TheWeatherChannelIE from .theweatherchannel import TheWeatherChannelIE
from .thirtydaysinger import ThirtyDaySingerIE, ThirtyDaySingerPlaylistIE
from .thisamericanlife import ThisAmericanLifeIE from .thisamericanlife import ThisAmericanLifeIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .thisvid import ( from .thisvid import (

View File

@ -0,0 +1,104 @@
import re
import urllib.parse

from .wistia import WistiaBaseIE
from ..utils import clean_html, get_elements_html_by_class
class ThirtyDaySingerBase(WistiaBaseIE):
    """Shared helpers for the 30 Day Singer lesson and playlist extractors."""

    # Captures the final path segment of a `/tutorial/<course>/<lesson>` URL.
    _INDEX_EXTRACTION_RE = r'/tutorial/[\w-]+/(?P<index>[\w-]+)'

    def _extract_for_url(self, url):
        """Build the info dict for the single lesson page at `url`.

        The metadata returned for the page's Wistia embed is merged with the
        title/description scraped from the page itself; on key collisions the
        page values take precedence.
        """
        lesson_index = re.search(self._INDEX_EXTRACTION_RE, url).group('index')
        webpage = self._download_webpage(url, lesson_index)

        # Only the first embed found on the page is used.
        match = next(self._extract_wistia_async_embed(webpage))
        embed_config = self._download_embed_config('medias', match.group('id'), url)
        embed_infojson = self._extract_media(embed_config)
        webpage_infojson = self._extract_webpage_data(webpage)

        return {**embed_infojson, **webpage_infojson}

    def _extract_webpage_data(self, webpage):
        """Scrape `title` and `description` from a lesson or course page.

        The title comes from the first `<h1>`, falling back to the HTML
        `<title>` when no heading matches. The description is `None` when the
        page has no description meta tag.
        """
        # `default=None` keeps a missing <h1> non-fatal so the fallback below
        # is actually reachable.
        title = self._html_search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', default=None)

        description = self._html_search_meta('description', webpage, fatal=False)
        if description is not None:
            description = clean_html(self._format_html_list(description))

        return {
            'title': title or self._html_extract_title(webpage),
            'description': description,
        }

    def _format_html_list(self, html):
        """Rewrite HTML list markup into `<br>`-separated `- item` lines.

        The site makes extensive use of HTML lists for formatting and
        `clean_html` doesn't handle them well. This is needed to keep lists
        readable. Falsy input (`None` or an empty string) is returned
        unchanged.
        """
        if not html:
            return html

        # NOTE(review): the closing tag used to be matched as '</ul', which
        # left a stray '>' in the output. Descriptions containing lists change
        # as a result, so description checksums in the tests may need updating.
        replacements = {
            '<ul>': '<br>',
            '</ul>': '<br>',
            '<li>': '<br>- ',
            '</li>': '',
        }
        for tag, replacement in replacements.items():
            html = html.replace(tag, replacement)

        return html
class ThirtyDaySingerIE(ThirtyDaySingerBase):
    """Extractor for a single 30 Day Singer lesson page."""

    # Two path segments after `/tutorial/`: the course slug, then the lesson.
    _VALID_URL = r'https?://www\.30daysinger\.com/tutorial/[\w-]+/[\w-]+'
    _TESTS = [
        {
            'url': 'https://www.30daysinger.com/tutorial/30-day-beginner-course-with-jonathan-estabrooks/1',
            'md5': '56bb11529b9777899b27b599d4b16cf6',
            'info_dict': {
                'id': 'tegd38l3d5',
                'ext': 'mp4',
                'thumbnail': 'http://embed.wistia.com/deliveries/c26a85cb98e32efa8a5e12a0576e63355af66230.jpg',
                'upload_date': '20190608',
                'description': 'md5:d3291de8988be57b1d3e411126ba4d33',
                'duration': 344.22,
                'timestamp': 1559952526,
                'title': 'Welcome to 30 Day Singer',
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the lesson at `url`."""
        # All of the work is done by the shared per-lesson helper.
        lesson_info = self._extract_for_url(url)
        return lesson_info
class ThirtyDaySingerPlaylistIE(ThirtyDaySingerBase):
    """Extractor for a 30 Day Singer course page, returned as a playlist."""

    # Base against which lesson links found on the course page are resolved.
    _URI_BASE = 'https://www.30daysinger.com'
    # A single path segment after `/tutorial/`; anything deeper is a lesson URL.
    _VALID_URL = r'https?://www\.30daysinger\.com/tutorial/(?P<playlist_id>[\w-]+)/?(?:$|[#?])'
    _TESTS = [{
        'url': 'https://www.30daysinger.com/tutorial/30-day-beginner-course-with-jonathan-estabrooks',
        'info_dict': {
            'id': '30-day-beginner-course-with-jonathan-estabrooks',
            'description': 'md5:8cf6d6c7c377895653c9cde9dfc4104f',
            'title': '30 Day Beginner Course with Jonathan Estabrooks',
        },
        'playlist_count': 1,
        'expected_warnings': ['This video is for premium members only'],
    }]

    def _real_extract(self, url):
        """Return a playlist holding every accessible lesson of the course.

        Each `playlist-item-link` element on the course page is resolved to a
        lesson URL and extracted. Links without an `href` are skipped, and
        links containing `upgrade` are skipped with a warning.
        """
        playlist_id = self._match_valid_url(url).group('playlist_id')
        webpage = self._download_webpage(url, playlist_id)
        playlist_attrs = self._extract_webpage_data(webpage)

        entries = []
        for html_element in get_elements_html_by_class('playlist-item-link', webpage):
            # `default=None` makes the lookup non-fatal, so an element without
            # an href is skipped instead of aborting the whole playlist.
            href = self._search_regex(
                r'href="([^"]+)"', html_element, 'href', default=None)
            if not href:
                continue

            # Often _some_ content is free so we should still download that but warn the user
            # when we encounter premium content.
            # NOTE: this only applies to the playlist extractor, not the single video extractor
            if 'upgrade' in href:
                self.report_warning('This video is for premium members only')
                continue

            # urljoin copes with absolute and non-root-relative hrefs, which
            # plain string concatenation would turn into broken URLs.
            lesson_url = urllib.parse.urljoin(self._URI_BASE, href)
            entries.append(self._extract_for_url(lesson_url))

        return self.playlist_result(
            entries, playlist_id, playlist_attrs['title'], playlist_attrs['description'])