Compare commits

...

24 Commits

Author SHA1 Message Date
MMM
1ffc6f95bd
Merge a91d9e1084 into e079ffbda6 2024-11-18 03:41:01 +05:30
gillux
e079ffbda6
[ie/litv] Fix extractor (#11071)
Authored by: jiru
2024-11-17 21:37:15 +00:00
bashonly
2009cb27e1
[ie/SonyLIVSeries] Add sort_order extractor-arg (#11569)
Authored by: bashonly
2024-11-17 21:16:22 +00:00
Jackson Humphrey
f351440f1d
[ie/ctvnews] Fix extractor (#11534)
Closes #8689
Authored by: jshumphrey, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
2024-11-17 21:06:50 +00:00
qbnu
f9d98509a8
[ie/ctvnews] Fix playlist ID extraction (#8892)
Authored by: qbnu
2024-11-17 19:35:10 +00:00
sepro
37cd7660ea
[ie/youtube:tab] Fix podcasts tab extraction (#11567)
Authored by: seproDev
2024-11-17 19:46:04 +01:00
ChocoLZS
d867f99622
[ie/PiaLive] Add extractor (#10811)
Authored by: ChocoLZS
2024-11-17 19:41:57 +01:00
doe1080
10fc719bc7
[cleanup] Remove dead extractors (#11566)
- Removes MildomClipIE, MildomIE, MildomUserVodIE, MildomVodIE
- Removes PokemonIE, PokemonWatchIE
- Removes VeohIE, VeohUserIE

Closes #3373, Closes #7059
Authored by: doe1080
2024-11-17 16:22:40 +00:00
flashdagger
a91d9e1084
[parsing] support comment end tag '--!>' as suggested by github-advanced-security bot 2023-11-13 07:19:42 +01:00
flashdagger
c34166d7c8
[parsing] support uppercase SCRIPT tags as suggested by github-advanced-security bot 2023-11-13 06:54:28 +01:00
flashdagger
b35550248a
Merge branch 'master' into html-parsing-relaxed 2023-11-13 06:40:03 +01:00
Marcel
7a9dd3d35f
[parsing] inline tag_obj.closerange() 2023-03-18 18:38:49 +01:00
Marcel
8d87bb4d91
[parsing] unify tag nesting 2023-03-18 18:38:48 +01:00
Marcel
65f91148fc
[parsing] search for case-insensitive tag names 2023-03-18 18:38:48 +01:00
Marcel
6169b3eca8
[parsing] replace HTMLCommentRanges with HTMLIgnoreRanges
* ignore matches within CDATA elements and comments
2023-03-18 18:38:47 +01:00
Marcel
29278a3323
[parsing] fix return value 2023-03-18 18:38:46 +01:00
Marcel
7a67a2028f
[parsing] tweak tag regex 2023-03-18 18:38:46 +01:00
Marcel
dbf350c122
[parsing] return unclosed matched tags 2023-03-18 18:38:45 +01:00
Marcel
8451074b50
[parsing] fix: don't push unmatched void tags onto queue 2023-03-18 18:38:45 +01:00
Marcel
176a156c65
[parsing] rework interface, implemented all get_element(s) functions + extract_attributes() as MatchingElementParser class methods and improve performance 2023-03-18 18:38:44 +01:00
Marcel
e092ba9922
[test] rollback test_utils.py and add related tests to test_parsing.py 2023-03-18 18:38:44 +01:00
Marcel
5e3894df3f
[parsing] add new module containing various HTML parser classes as replacement for utils.get_html_... functions
* performance is mostly better for large HTML data and on PyPy
2023-03-18 18:38:43 +01:00
Marcel
af03fa4542
[utils] more forgiving html parsing + unit tests 2023-03-18 18:38:43 +01:00
Marcel
da0d84258b
[test/test_utils] refactor test_get_element_text_and_html_by_tag() 2023-03-18 18:38:37 +01:00
15 changed files with 1170 additions and 820 deletions

View File

@ -1869,6 +1869,9 @@ The following extractors use this feature:
#### digitalconcerthall
* `prefer_combined_hls`: Prefer extracting combined/pre-merged video and audio HLS formats. This will exclude 4K/HEVC video and lossless/FLAC audio formats, which are only available as split video/audio HLS formats
#### sonylivseries
* `sort_order`: Episode sort order for series extraction - one of `asc` (ascending, oldest first) or `desc` (descending, newest first). Default is `asc`
**Note**: These options may be changed/removed in the future without concern for backward compatibility
<!-- MANPAGE: MOVE "INSTALLATION" SECTION HERE -->

359
test/test_parsing.py Normal file
View File

@ -0,0 +1,359 @@
import textwrap
import unittest
from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
MatchingElementParser,
HTMLIgnoreRanges,
HTMLTagParser,
)
extract_attributes = MatchingElementParser.extract_attributes
get_element_by_attribute = MatchingElementParser.get_element_by_attribute
get_element_by_class = MatchingElementParser.get_element_by_class
get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
get_element_html_by_class = MatchingElementParser.get_element_html_by_class
get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
get_elements_by_class = MatchingElementParser.get_elements_by_class
get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag
class TestParsing(unittest.TestCase):
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'}) # XML
self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'}) # HTML 3.2
self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'}) # HTML 4.0
self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
self.assertEqual(extract_attributes('<e x >'), {'x': None})
self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased
self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
# "Narrow" Python builds don't support unicode code points outside BMP.
try:
chr(0x10000)
supports_outside_bmp = True
except ValueError:
supports_outside_bmp = False
if supports_outside_bmp:
self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
# Malformed HTML should not break attributes extraction on older Python
self.assertEqual(extract_attributes('<mal"formed/>'), {})
GET_ELEMENT_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span>
<div class="foo bar">also nice</div>
'''
def test_get_element_by_class(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_class('foo', html), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
def test_get_element_html_by_class(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_html_by_class('foo', html),
'<span class="foo bar">nice</span>')
self.assertEqual(get_element_by_class('no-such-class', html), None)
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
<div itemprop="author" itemscope>foo</div>
'''
def test_get_element_by_attribute(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')
html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
def test_get_element_html_by_attribute(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html),
'<span class="foo bar">nice</span>')
self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span>
<span class="foo bar">also nice</span>
'''
GET_ELEMENTS_BY_CLASS_RES = [
'<span class="foo bar">nice</span>',
'<span class="foo bar">also nice</span>'
]
def test_get_elements_by_class(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_class('no-such-class', html), [])
def test_get_elements_html_by_class(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
def test_get_elements_by_attribute(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
def test_get_elements_html_by_attribute(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html),
self.GET_ELEMENTS_BY_CLASS_RES)
self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
def test_get_elements_text_and_html_by_attribute(self):
html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
self.assertEqual(
get_elements_text_and_html_by_attribute('class', 'foo bar', html),
list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
self.assertEqual(get_elements_text_and_html_by_attribute(
'class', 'foo', '<a class="foo">nice</a><span class="foo">not nice</span>', tag='a'),
[('nice', '<a class="foo">nice</a>')])
def test_get_element_text_and_html_by_tag(self):
get_element_by_tag_test_string = '''
random text lorem ipsum</p>
<div>
this should be returned
<span>this should also be returned</span>
<div>
this should also be returned
</div>
closing tag above should not trick, so this should also be returned
</div>
but this text should not be returned
'''
html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
get_element_by_tag_res_innerspan_html = html.strip()[78:119]
get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
self.assertEqual(
get_element_text_and_html_by_tag('div', html),
(get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
self.assertEqual(
get_element_text_and_html_by_tag('span', html),
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
self.assertIsNone(get_element_text_and_html_by_tag('article', html))
def test_get_elements_text_and_html_by_tag(self):
class StrictParser(MatchingElementParser):
STRICT = True
test_string = '''
<img src="a.png">
<img src="b.png" />
<span>ignore</span>
'''
items = get_elements_text_and_html_by_tag('img', test_string)
self.assertEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
self.assertEqual(
StrictParser.get_element_text_and_html_by_tag('use', '<use><img></use>'),
('<img>', '<use><img></use>'))
def test_get_element_text_and_html_by_tag_malformed(self):
inner_text = 'inner text'
malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
commented_html = '<!--<div>inner comment</div>-->'
outerdiv_html = f'<div>{malnested_elements}</div>'
html = f'{commented_html}{outerdiv_html}'
self.assertEqual(
get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
self.assertEqual(
get_element_text_and_html_by_tag('malnested_a', html),
(f'<malnested_b>{inner_text}',
f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
self.assertEqual(
get_element_text_and_html_by_tag('malnested_b', html),
(f'{inner_text}</malnested_a>',
f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
self.assertEqual(
get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
# ignore case on tags
ci_html = f'<SpAn>{html}</sPaN>'
self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))
def test_strict_html_parsing(self):
class StrictTagParser(HTMLTagParser):
STRICT = True
parser = StrictTagParser()
with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
parser.taglist('</p>', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
parser.taglist('<div><p>', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
parser.taglist('<div><p></div></p>', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
parser.taglist('<div><p>/p></div>', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
parser.taglist('<div><p></p<< </div>', reset=True)
with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
parser.taglist('<img>must be empty</img>', reset=True)
def test_relaxed_html_parsing(self):
Tag = HTMLTagParser.Tag
parser = HTMLTagParser()
self.assertEqual(parser.taglist('</p>', reset=True), [])
tags = parser.taglist('<div><p>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(), ('', '<div>'))
self.assertEqual(tags[1].text_and_html(), ('', '<p>'))
tags = parser.taglist('<div><p></div></p>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(), ('<p>', '<div><p></div>'))
self.assertEqual(tags[1].text_and_html(), ('</div>', '<p></div></p>'))
tags = parser.taglist('<div><p>/p></div>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(), ('<p>/p>', '<div><p>/p></div>'))
self.assertEqual(tags[1].text_and_html(), ('', '<p>'))
tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[0].text_and_html(),
('<p>paragraph</p<ignored>', '<div><p>paragraph</p<ignored></div>'))
self.assertEqual(tags[1].text_and_html(), ('paragraph', '<p>paragraph</p<ignored>'))
tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
self.assertEqual(tags, [Tag('img')])
self.assertEqual(tags[0].text_and_html(), ('', '<img width="300px">'))
def test_compliant_html_parsing(self):
# certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
Tag = HTMLTagParser.Tag
html = '''
no error without closing tag: <img>
self closing is ok: <img />
'''
parser = HTMLTagParser()
tags = parser.taglist(html, reset=True)
self.assertEqual(tags, [Tag('img'), Tag('img')])
# don't get fooled by '>' in attributes
html = '''<img greater_a='1>0' greater_b="1>0">'''
tags = parser.taglist(html, reset=True)
self.assertEqual(tags[0].text_and_html(), ('', html))
def test_tag_return_order(self):
Tag = HTMLTagParser.Tag
html = '''
<t0>
<t1>
<t2>
<t3 /> <t4 />
</t2>
</t1>
<t5>
<t6 />
</t5>
</t0>
<t7>
<t8 />
</t7>
'''
parser = HTMLTagParser()
tags = parser.taglist(html, reset=True)
self.assertEqual(
str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
tags = parser.taglist(html, reset=True, depth_first=True)
self.assertEqual(
str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
# return tags in nested order
tags = parser.taglist(html, reset=True, depth_first=None)
self.assertEqual(
str(tags), str([
[Tag('t0'),
[Tag('t1'),
[Tag('t2'), [Tag('t3')], [Tag('t4')]]],
[Tag('t5'), [Tag('t6')]]],
[Tag('t7'), [Tag('t8')]]]))
def test_html_ignored_ranges(self):
def mark_comments(_string, char='^', nochar='-'):
cmts = HTMLIgnoreRanges(_string)
return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
html_string = '''
no comments in this line
---------------------------------------------------------------------
<!-- whole line represents a comment -->
----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
before <!-- comment --> after
-----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
this is a leftover comment --> <!-- a new comment without closing
^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
here is <!-- a comment --> and <!-- another comment --!> end
----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
<script> ignore here </script> <SCRIPT> and here </SCRIPT>
--------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
'''
lines = textwrap.dedent(html_string).strip().splitlines()
for line, marker in zip(lines[0::2], lines[1::2]):
self.assertEqual((line, mark_comments(line)), (line, marker))
# yet we must be able to match script elements
test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
items = get_element_text_and_html_by_tag('script', test_string)
self.assertEqual(items, ("var foo = 'bar';", test_string))

View File

@ -1139,12 +1139,6 @@ from .microsoftembed import (
MicrosoftMediusIE,
)
from .microsoftstream import MicrosoftStreamIE
from .mildom import (
MildomClipIE,
MildomIE,
MildomUserVodIE,
MildomVodIE,
)
from .minds import (
MindsChannelIE,
MindsGroupIE,
@ -1526,8 +1520,8 @@ from .pgatour import PGATourIE
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
from .pialive import PiaLiveIE
from .piapro import PiaproIE
from .piaulizaportal import PIAULIZAPortalIE
from .picarto import (
PicartoIE,
PicartoVodIE,
@ -1563,10 +1557,6 @@ from .podbayfm import (
)
from .podchaser import PodchaserIE
from .podomatic import PodomaticIE
from .pokemon import (
PokemonIE,
PokemonWatchIE,
)
from .pokergo import (
PokerGoCollectionIE,
PokerGoIE,
@ -2260,6 +2250,10 @@ from .ufctv import (
)
from .ukcolumn import UkColumnIE
from .uktvplay import UKTVPlayIE
from .uliza import (
UlizaPlayerIE,
UlizaPortalIE,
)
from .umg import UMGDeIE
from .unistra import UnistraIE
from .unity import UnityIE
@ -2288,10 +2282,6 @@ from .utreon import UtreonIE
from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veo import VeoIE
from .veoh import (
VeohIE,
VeohUserIE,
)
from .vesti import VestiIE
from .vevo import (
VevoIE,

View File

@ -1,14 +1,27 @@
import json
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import orderedSet
from .ninecninemedia import NineCNineMediaIE
from ..utils import extract_attributes, orderedSet
from ..utils.traversal import find_element, traverse_obj
class CTVNewsIE(InfoExtractor):
_VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
_BASE_REGEX = r'https?://(?:[^.]+\.)?ctvnews\.ca/'
_VIDEO_ID_RE = r'(?P<id>\d{5,})'
_PLAYLIST_ID_RE = r'(?P<id>\d\.\d{5,})'
_VALID_URL = [
rf'{_BASE_REGEX}video/c{_VIDEO_ID_RE}',
rf'{_BASE_REGEX}video(?:-gallery)?/?\?clipId={_VIDEO_ID_RE}',
rf'{_BASE_REGEX}video/?\?(?:playlist|bin)Id={_PLAYLIST_ID_RE}',
rf'{_BASE_REGEX}(?!video/)[^?#]*?{_PLAYLIST_ID_RE}/?(?:$|[?#])',
rf'{_BASE_REGEX}(?!video/)[^?#]+\?binId={_PLAYLIST_ID_RE}',
]
_TESTS = [{
'url': 'http://www.ctvnews.ca/video?clipId=901995',
'md5': '9b8624ba66351a23e0b6e1391971f9af',
'md5': 'b608f466c7fa24b9666c6439d766ab7e',
'info_dict': {
'id': '901995',
'ext': 'flv',
@ -16,6 +29,33 @@ class CTVNewsIE(InfoExtractor):
'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
'timestamp': 1467286284,
'upload_date': '20160630',
'categories': [],
'season_number': 0,
'season': 'Season 0',
'tags': [],
'series': 'CTV News National | Archive | Stories 2',
'season_id': '57981',
'thumbnail': r're:https?://.*\.jpg$',
'duration': 764.631,
},
}, {
'url': 'https://barrie.ctvnews.ca/video/c3030933-here_s-what_s-making-news-for-nov--15?binId=1272429',
'md5': '8b8c2b33c5c1803e3c26bc74ff8694d5',
'info_dict': {
'id': '3030933',
'ext': 'flv',
'title': 'Heres whats making news for Nov. 15',
'description': 'Here are the top stories were working on for CTV News at 11 for Nov. 15',
'thumbnail': 'http://images2.9c9media.com/image_asset/2021_2_22_a602e68e-1514-410e-a67a-e1f7cccbacab_png_2000x1125.jpg',
'season_id': '58104',
'season_number': 0,
'tags': [],
'season': 'Season 0',
'categories': [],
'series': 'CTV News Barrie',
'upload_date': '20241116',
'duration': 42.943,
'timestamp': 1731722452,
},
}, {
'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
@ -31,6 +71,72 @@ class CTVNewsIE(InfoExtractor):
'id': '1.2876780',
},
'playlist_mincount': 100,
}, {
'url': 'https://www.ctvnews.ca/it-s-been-23-years-since-toronto-called-in-the-army-after-a-major-snowstorm-1.5736957',
'info_dict':
{
'id': '1.5736957',
},
'playlist_mincount': 6,
}, {
'url': 'https://www.ctvnews.ca/business/respondents-to-bank-of-canada-questionnaire-largely-oppose-creating-a-digital-loonie-1.6665797',
'md5': '24bc4b88cdc17d8c3fc01dfc228ab72c',
'info_dict': {
'id': '2695026',
'ext': 'flv',
'season_id': '89852',
'series': 'From CTV News Channel',
'description': 'md5:796a985a23cacc7e1e2fafefd94afd0a',
'season': '2023',
'title': 'Bank of Canada asks public about digital currency',
'categories': [],
'tags': [],
'upload_date': '20230526',
'season_number': 2023,
'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg',
'timestamp': 1685105157,
'duration': 253.553,
},
}, {
'url': 'https://stox.ctvnews.ca/video-gallery?clipId=582589',
'md5': '135cc592df607d29dddc931f1b756ae2',
'info_dict': {
'id': '582589',
'ext': 'flv',
'categories': [],
'timestamp': 1427906183,
'season_number': 0,
'duration': 125.559,
'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg',
'series': 'CTV News Stox',
'description': 'CTV original footage of the rise and fall of the Berlin Wall.',
'title': 'Berlin Wall',
'season_id': '63817',
'season': 'Season 0',
'tags': [],
'upload_date': '20150401',
},
}, {
'url': 'https://ottawa.ctvnews.ca/features/regional-contact/regional-contact-archive?binId=1.1164587#3023759',
'md5': 'a14c0603557decc6531260791c23cc5e',
'info_dict': {
'id': '3023759',
'ext': 'flv',
'season_number': 2024,
'timestamp': 1731798000,
'season': '2024',
'episode': 'Episode 125',
'description': 'CTV News Ottawa at Six',
'duration': 2712.076,
'episode_number': 125,
'upload_date': '20241116',
'title': 'CTV News Ottawa at Six for Saturday, November 16, 2024',
'thumbnail': 'http://images2.9c9media.com/image_asset/2019_3_28_35f5afc3-10f6-4d92-b194-8b9a86f55c6a_png_1920x1080.jpg',
'categories': [],
'tags': [],
'series': 'CTV News Ottawa at Six',
'season_id': '92667',
},
}, {
'url': 'http://www.ctvnews.ca/1.810401',
'only_matching': True,
@ -42,29 +148,35 @@ class CTVNewsIE(InfoExtractor):
'only_matching': True,
}]
def _ninecninemedia_url_result(self, clip_id):
return self.url_result(f'9c9media:ctvnews_web:{clip_id}', NineCNineMediaIE, clip_id)
def _real_extract(self, url):
page_id = self._match_id(url)
def ninecninemedia_url_result(clip_id):
return {
'_type': 'url_transparent',
'id': clip_id,
'url': f'9c9media:ctvnews_web:{clip_id}',
'ie_key': 'NineCNineMedia',
}
if mobj := re.fullmatch(self._VIDEO_ID_RE, urllib.parse.urlparse(url).fragment):
page_id = mobj.group('id')
if page_id.isdigit():
return ninecninemedia_url_result(page_id)
else:
webpage = self._download_webpage(f'http://www.ctvnews.ca/{page_id}', page_id, query={
'ot': 'example.AjaxPageLayout.ot',
'maxItemsPerPage': 1000000,
})
entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
if not entries:
webpage = self._download_webpage(url, page_id)
if 'getAuthStates("' in webpage:
entries = [ninecninemedia_url_result(clip_id) for clip_id in
self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')]
return self.playlist_result(entries, page_id)
if re.fullmatch(self._VIDEO_ID_RE, page_id):
return self._ninecninemedia_url_result(page_id)
webpage = self._download_webpage(f'https://www.ctvnews.ca/{page_id}', page_id, query={
'ot': 'example.AjaxPageLayout.ot',
'maxItemsPerPage': 1000000,
})
entries = [self._ninecninemedia_url_result(clip_id)
for clip_id in orderedSet(re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
if not entries:
webpage = self._download_webpage(url, page_id)
if 'getAuthStates("' in webpage:
entries = [self._ninecninemedia_url_result(clip_id) for clip_id in
self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')]
else:
entries = [
self._ninecninemedia_url_result(clip_id) for clip_id in
traverse_obj(webpage, (
{find_element(tag='jasper-player-container', html=True)},
{extract_attributes}, 'axis-ids', {json.loads}, ..., 'axisId'))
]
return self.playlist_result(entries, page_id)

View File

@ -1,30 +1,32 @@
import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
join_nonempty,
smuggle_url,
traverse_obj,
try_call,
unsmuggle_url,
urljoin,
)
class LiTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)'
_URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s'
_VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:[^/?#]+/watch/|vod/[^/?#]+/content\.do\?content_id=)(?P<id>[\w-]+)'
_URL_TEMPLATE = 'https://www.litv.tv/%s/watch/%s'
_GEO_COUNTRIES = ['TW']
_TESTS = [{
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'url': 'https://www.litv.tv/drama/watch/VOD00041610',
'info_dict': {
'id': 'VOD00041606',
'title': '花千骨',
},
'playlist_count': 51, # 50 episodes + 1 trailer
}, {
'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
'url': 'https://www.litv.tv/drama/watch/VOD00041610',
'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a',
'info_dict': {
'id': 'VOD00041610',
@ -32,16 +34,15 @@ class LiTVIE(InfoExtractor):
'title': '花千骨第1集',
'thumbnail': r're:https?://.*\.jpg$',
'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。',
'categories': ['奇幻', '愛情', '中國', '仙俠'],
'categories': ['奇幻', '愛情', '仙俠', '古裝'],
'episode': 'Episode 1',
'episode_number': 1,
},
'params': {
'noplaylist': True,
},
'skip': 'Georestricted to Taiwan',
}, {
'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&',
'url': 'https://www.litv.tv/drama/watch/VOD00044841',
'md5': '88322ea132f848d6e3e18b32a832b918',
'info_dict': {
'id': 'VOD00044841',
@ -55,94 +56,62 @@ class LiTVIE(InfoExtractor):
def _extract_playlist(self, playlist_data, content_type):
all_episodes = [
self.url_result(smuggle_url(
self._URL_TEMPLATE % (content_type, episode['contentId']),
self._URL_TEMPLATE % (content_type, episode['content_id']),
{'force_noplaylist': True})) # To prevent infinite recursion
for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))]
for episode in traverse_obj(playlist_data, ('seasons', ..., 'episodes', lambda _, v: v['content_id']))]
return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title'))
return self.playlist_result(all_episodes, playlist_data['content_id'], playlist_data.get('title'))
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
vod_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']
if self._search_regex(
r'(?i)<meta\s[^>]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"',
webpage, 'meta refresh redirect', default=False, group=0):
raise ExtractorError('No such content found', expected=True)
program_info = traverse_obj(vod_data, ('programInformation', {dict})) or {}
playlist_data = traverse_obj(vod_data, ('seriesTree'))
if playlist_data and self._yes_playlist(program_info.get('series_id'), video_id, smuggled_data):
return self._extract_playlist(playlist_data, program_info.get('content_type'))
program_info = self._parse_json(self._search_regex(
r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
video_id)
asset_id = traverse_obj(program_info, ('assets', 0, 'asset_id', {str}))
if asset_id: # This is a VOD
media_type = 'vod'
else: # This is a live stream
asset_id = program_info['content_id']
media_type = program_info['content_type']
puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
if puid:
endpoint = 'get-urls'
else:
puid = str(uuid.uuid4())
endpoint = 'get-urls-no-auth'
video_data = self._download_json(
f'https://www.litv.tv/api/{endpoint}', video_id,
data=json.dumps({'AssetId': asset_id, 'MediaType': media_type, 'puid': puid}).encode(),
headers={'Content-Type': 'application/json'})
# In browsers `getProgramInfo` request is always issued. Usually this
# endpoint gives the same result as the data embedded in the webpage.
# If, for some reason, there are no embedded data, we do an extra request.
if 'assetId' not in program_info:
program_info = self._download_json(
'https://www.litv.tv/vod/ajax/getProgramInfo', video_id,
query={'contentId': video_id},
headers={'Accept': 'application/json'})
series_id = program_info['seriesId']
if self._yes_playlist(series_id, video_id, smuggled_data):
playlist_data = self._download_json(
'https://www.litv.tv/vod/ajax/getSeriesTree', video_id,
query={'seriesId': series_id}, headers={'Accept': 'application/json'})
return self._extract_playlist(playlist_data, program_info['contentType'])
video_data = self._parse_json(self._search_regex(
r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
webpage, 'video data', default='{}'), video_id)
if not video_data:
payload = {'assetId': program_info['assetId']}
puid = try_call(lambda: self._get_cookies('https://www.litv.tv/')['PUID'].value)
if puid:
payload.update({
'type': 'auth',
'puid': puid,
})
endpoint = 'getUrl'
else:
payload.update({
'watchDevices': program_info['watchDevices'],
'contentType': program_info['contentType'],
})
endpoint = 'getMainUrlNoAuth'
video_data = self._download_json(
f'https://www.litv.tv/vod/ajax/{endpoint}', video_id,
data=json.dumps(payload).encode(),
headers={'Content-Type': 'application/json'})
if not video_data.get('fullpath'):
error_msg = video_data.get('errorMessage')
if error_msg == 'vod.error.outsideregionerror':
if error := traverse_obj(video_data, ('error', {dict})):
error_msg = traverse_obj(error, ('message', {str}))
if error_msg and 'OutsideRegionError' in error_msg:
self.raise_geo_restricted('This video is available in Taiwan only')
if error_msg:
elif error_msg:
raise ExtractorError(f'{self.IE_NAME} said: {error_msg}', expected=True)
raise ExtractorError(f'Unexpected result from {self.IE_NAME}')
raise ExtractorError(f'Unexpected error from {self.IE_NAME}')
formats = self._extract_m3u8_formats(
video_data['fullpath'], video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls')
video_data['result']['AssetURLs'][0], video_id, ext='mp4', m3u8_id='hls')
for a_format in formats:
# LiTV HLS segments doesn't like compressions
a_format.setdefault('http_headers', {})['Accept-Encoding'] = 'identity'
title = program_info['title'] + program_info.get('secondaryMark', '')
description = program_info.get('description')
thumbnail = program_info.get('imageFile')
categories = [item['name'] for item in program_info.get('category', [])]
episode = int_or_none(program_info.get('episode'))
return {
'id': video_id,
'formats': formats,
'title': title,
'description': description,
'thumbnail': thumbnail,
'categories': categories,
'episode_number': episode,
'title': join_nonempty('title', 'secondary_mark', delim='', from_dict=program_info),
**traverse_obj(program_info, {
'description': ('description', {str}),
'thumbnail': ('picture', {urljoin('https://p-cdnstatic.svc.litv.tv/')}),
'categories': ('genres', ..., 'name', {str}),
'episode_number': ('episode', {int_or_none}),
}),
}

View File

@ -1,291 +0,0 @@
import functools
import json
import uuid
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
determine_ext,
dict_get,
float_or_none,
traverse_obj,
)
class MildomBaseIE(InfoExtractor):
_GUEST_ID = None
def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None):
if not self._GUEST_ID:
self._GUEST_ID = f'pc-gp-{uuid.uuid4()}'
content = self._download_json(
url, video_id, note=note, data=json.dumps(body).encode() if body else None,
headers={'Content-Type': 'application/json'} if body else {},
query={
'__guest_id': self._GUEST_ID,
'__platform': 'web',
**(query or {}),
})
if content['code'] != 0:
raise ExtractorError(
f'Mildom says: {content["message"]} (code {content["code"]})',
expected=True)
return content['body']
class MildomIE(MildomBaseIE):
IE_NAME = 'mildom'
IE_DESC = 'Record ongoing live by specific user in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/(?P<id>\d+)'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id)
enterstudio = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id,
note='Downloading live metadata', query={'user_id': video_id})
result_video_id = enterstudio.get('log_id', video_id)
servers = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id,
note='Downloading live server list', query={
'user_id': video_id,
'live_server_type': 'hls',
})
playback_token = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id,
note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'})
playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False)
if not playback_token:
raise ExtractorError('Failed to obtain live playback token')
formats = self._extract_m3u8_formats(
f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}',
result_video_id, 'mp4', headers={
'Referer': 'https://www.mildom.com/',
'Origin': 'https://www.mildom.com',
})
for fmt in formats:
fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/'
return {
'id': result_video_id,
'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'),
'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str),
'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000),
'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'),
'uploader_id': video_id,
'formats': formats,
'is_live': True,
}
class MildomVodIE(MildomBaseIE):
IE_NAME = 'mildom:vod'
IE_DESC = 'VOD in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)'
_TESTS = [{
'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269',
'info_dict': {
'id': '10882672-1597662269',
'ext': 'mp4',
'title': '始めてのミルダム配信じゃぃ!',
'thumbnail': r're:^https?://.*\.(png|jpg)$',
'upload_date': '20200817',
'duration': 4138.37,
'description': 'ゲームをしたくて!',
'timestamp': 1597662269.0,
'uploader_id': '10882672',
'uploader': 'kson組長(けいそん)',
},
}, {
'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477',
'info_dict': {
'id': '10882672-1597758589870-477',
'ext': 'mp4',
'title': '【kson】感染メイズ麻酔銃で無双する',
'thumbnail': r're:^https?://.*\.(png|jpg)$',
'timestamp': 1597759093.0,
'uploader': 'kson組長(けいそん)',
'duration': 4302.58,
'uploader_id': '10882672',
'description': 'このステージ絶対乗り越えたい',
'upload_date': '20200818',
},
}, {
'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0',
'info_dict': {
'id': '10882672-buha9td2lrn97fk2jme0',
'ext': 'mp4',
'title': '【kson組長】CART RACER!!!',
'thumbnail': r're:^https?://.*\.(png|jpg)$',
'uploader_id': '10882672',
'uploader': 'kson組長(けいそん)',
'upload_date': '20201104',
'timestamp': 1604494797.0,
'duration': 4657.25,
'description': 'WTF',
},
}]
def _real_extract(self, url):
user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id)
autoplay = self._call_api(
'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
note='Downloading playback metadata', query={
'v_id': video_id,
})['playback']
formats = [{
'url': autoplay['audio_url'],
'format_id': 'audio',
'protocol': 'm3u8_native',
'vcodec': 'none',
'acodec': 'aac',
'ext': 'm4a',
}]
for fmt in autoplay['video_link']:
formats.append({
'format_id': 'video-{}'.format(fmt['name']),
'url': fmt['url'],
'protocol': 'm3u8_native',
'width': fmt['level'] * autoplay['video_width'] // autoplay['video_height'],
'height': fmt['level'],
'vcodec': 'h264',
'acodec': 'aac',
'ext': 'mp4',
})
return {
'id': video_id,
'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'),
'description': traverse_obj(autoplay, 'video_intro'),
'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000),
'duration': float_or_none(autoplay.get('video_length'), scale=1000),
'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
'uploader': traverse_obj(autoplay, ('author_info', 'login_name')),
'uploader_id': user_id,
'formats': formats,
}
class MildomClipIE(MildomBaseIE):
IE_NAME = 'mildom:clip'
IE_DESC = 'Clip in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9',
'info_dict': {
'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9',
'title': '全然違ったよ',
'timestamp': 1619181890,
'duration': 59,
'thumbnail': r're:https?://.+',
'uploader': 'ざきんぽ',
'uploader_id': '10042245',
},
}, {
'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864',
'info_dict': {
'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864',
'title': 'かっこいい',
'timestamp': 1621094003,
'duration': 59,
'thumbnail': r're:https?://.+',
'uploader': '(ルーキー',
'uploader_id': '10111524',
},
}, {
'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
'info_dict': {
'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
'title': '',
'timestamp': 1614769431,
'duration': 31,
'thumbnail': r're:https?://.+',
'uploader': 'ドルゴルスレンギーン=ダグワドルジ',
'uploader_id': '10660174',
},
}]
def _real_extract(self, url):
user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id)
clip_detail = self._call_api(
'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id,
note='Downloading playback metadata', query={
'clip_id': video_id,
})
return {
'id': video_id,
'title': self._html_search_meta(
('og:description', 'description'), webpage, default=None) or clip_detail.get('title'),
'timestamp': float_or_none(clip_detail.get('create_time')),
'duration': float_or_none(clip_detail.get('length')),
'thumbnail': clip_detail.get('cover'),
'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')),
'uploader_id': user_id,
'url': clip_detail['url'],
'ext': determine_ext(clip_detail.get('url'), 'mp4'),
}
class MildomUserVodIE(MildomBaseIE):
IE_NAME = 'mildom:user:vod'
IE_DESC = 'Download all VODs from specific user in Mildom'
_VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/profile/(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.mildom.com/profile/10093333',
'info_dict': {
'id': '10093333',
'title': 'Uploads from ねこばたけ',
},
'playlist_mincount': 732,
}, {
'url': 'https://www.mildom.com/profile/10882672',
'info_dict': {
'id': '10882672',
'title': 'Uploads from kson組長(けいそん)',
},
'playlist_mincount': 201,
}]
def _fetch_page(self, user_id, page):
page += 1
reply = self._call_api(
'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
user_id, note=f'Downloading page {page}', query={
'user_id': user_id,
'page': page,
'limit': '30',
})
if not reply:
return
for x in reply:
v_id = x.get('v_id')
if not v_id:
continue
yield self.url_result(f'https://www.mildom.com/playback/{user_id}/{v_id}')
def _real_extract(self, url):
user_id = self._match_id(url)
self.to_screen(f'This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/{user_id}" instead')
profile = self._call_api(
'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id,
query={'user_id': user_id}, note='Downloading user profile')['user_info']
return self.playlist_result(
OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30),
user_id, f'Uploads from {profile["loginname"]}')

122
yt_dlp/extractor/pialive.py Normal file
View File

@ -0,0 +1,122 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_class,
multipart_encode,
str_or_none,
unified_timestamp,
url_or_none,
)
from ..utils.traversal import traverse_obj
class PiaLiveIE(InfoExtractor):
_VALID_URL = r'https?://player\.pia-live\.jp/stream/(?P<id>[\w-]+)'
_PLAYER_ROOT_URL = 'https://player.pia-live.jp/'
_PIA_LIVE_API_URL = 'https://api.pia-live.jp'
_API_KEY = 'kfds)FKFps-dms9e'
_TESTS = [{
'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krUDqGOwN4d61dCWQYOd6CTxl4hjya9dsfEZGsM4uGOUdax60lEI4twsXGXf7crmz8Gk__GhupTrWxA7RFRVt76',
'info_dict': {
'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
'display_id': '2431867_001',
'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)',
'live_status': 'was_live',
'comment_count': int,
},
'params': {
'getcomments': True,
'skip_download': True,
'ignore_no_formats_error': True,
},
'skip': 'The video is no longer available',
}, {
'url': 'https://player.pia-live.jp/stream/4JagFBEIM14s_hK9aXHKf3k3F3bY5eoHFQxu68TC6krJdu0GVBVbVy01IwpJ6J3qBEm3d9TCTt1d0eWpsZGj7DrOjVOmS7GAWGwyscMgiThopJvzgWC4H5b-7XQjAfRZ',
'info_dict': {
'id': '9ce8b8ba-f6d1-4d1f-83a0-18c3148ded93',
'display_id': '2431867_002',
'title': 'こながめでたい日2024の視聴ページ | PIA LIVE STREAM(ぴあライブストリーム)',
'live_status': 'was_live',
'comment_count': int,
},
'params': {
'getcomments': True,
'skip_download': True,
'ignore_no_formats_error': True,
},
'skip': 'The video is no longer available',
}]
def _extract_var(self, variable, html):
return self._search_regex(
rf'(?:var|const|let)\s+{variable}\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
html, f'variable {variable}', group='value')
def _real_extract(self, url):
video_key = self._match_id(url)
webpage = self._download_webpage(url, video_key)
program_code = self._extract_var('programCode', webpage)
article_code = self._extract_var('articleCode', webpage)
title = self._html_extract_title(webpage)
if get_element_html_by_class('play-end', webpage):
raise ExtractorError('The video is no longer available', expected=True, video_id=program_code)
if start_info := clean_html(get_element_by_class('play-waiting__date', webpage)):
date, time = self._search_regex(
r'(?P<date>\d{4}/\d{1,2}/\d{1,2})\([月火水木金土日]\)(?P<time>\d{2}:\d{2})',
start_info, 'start_info', fatal=False, group=('date', 'time'))
if date and time:
release_timestamp_str = f'{date} {time} +09:00'
release_timestamp = unified_timestamp(release_timestamp_str)
self.raise_no_formats(f'The video will be available after {release_timestamp_str}', expected=True)
return {
'id': program_code,
'title': title,
'live_status': 'is_upcoming',
'release_timestamp': release_timestamp,
}
payload, content_type = multipart_encode({
'play_url': video_key,
'api_key': self._API_KEY,
})
api_data_and_headers = {
'data': payload,
'headers': {'Content-Type': content_type, 'Referer': self._PLAYER_ROOT_URL},
}
player_tag_list = self._download_json(
f'{self._PIA_LIVE_API_URL}/perf/player-tag-list/{program_code}', program_code,
'Fetching player tag list', 'Unable to fetch player tag list', **api_data_and_headers)
return self.url_result(
extract_attributes(player_tag_list['data']['movie_one_tag'])['src'],
url_transparent=True, title=title, display_id=program_code,
__post_extractor=self.extract_comments(program_code, article_code, api_data_and_headers))
def _get_comments(self, program_code, article_code, api_data_and_headers):
chat_room_url = traverse_obj(self._download_json(
f'{self._PIA_LIVE_API_URL}/perf/chat-tag-list/{program_code}/{article_code}', program_code,
'Fetching chat info', 'Unable to fetch chat info', fatal=False, **api_data_and_headers),
('data', 'chat_one_tag', {extract_attributes}, 'src', {url_or_none}))
if not chat_room_url:
return
comment_page = self._download_webpage(
chat_room_url, program_code, 'Fetching comment page', 'Unable to fetch comment page',
fatal=False, headers={'Referer': self._PLAYER_ROOT_URL})
if not comment_page:
return
yield from traverse_obj(self._search_json(
r'var\s+_history\s*=', comment_page, 'comment list',
program_code, contains_pattern=r'\[(?s:.+)\]', fatal=False), (..., {
'timestamp': (0, {int}),
'author_is_uploader': (1, {lambda x: x == 2}),
'author': (2, {str}),
'text': (3, {str}),
'id': (4, {str_or_none}),
}))

View File

@ -1,70 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
parse_qs,
time_seconds,
traverse_obj,
)
class PIAULIZAPortalIE(InfoExtractor):
IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM'
_VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44',
'info_dict': {
'id': '005f18b7-e810-5618-cb82-0987c5755d44',
'title': 'プレゼンテーションプレイヤーのサンプル',
'live_status': 'not_live',
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}, {
'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1',
'info_dict': {
'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d',
'title': '【確認用】視聴サンプルページULIZA',
'live_status': 'not_live',
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0)))
if expires and expires <= time_seconds():
raise ExtractorError('The link is expired.', video_id=video_id, expected=True)
webpage = self._download_webpage(url, video_id)
player_data = self._download_webpage(
self._search_regex(
r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"',
webpage, 'player data url'),
video_id, headers={'Referer': 'https://ulizaportal.jp/'},
note='Fetching player data', errnote='Unable to fetch player data')
formats = self._extract_m3u8_formats(
self._search_regex(
r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data,
'm3u8 url', default=None),
video_id, fatal=False)
m3u8_type = self._search_regex(
r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None)
return {
'id': video_id,
'title': self._html_extract_title(webpage),
'formats': formats,
'live_status': {
'video': 'is_live',
'dvr': 'was_live', # short-term archives
}.get(m3u8_type, 'not_live'), # VOD or long-term archives
}

View File

@ -1,136 +0,0 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
extract_attributes,
int_or_none,
js_to_json,
merge_dicts,
)
class PokemonIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))'
_TESTS = [{
'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/',
'md5': '2fe8eaec69768b25ef898cda9c43062e',
'info_dict': {
'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4',
'ext': 'mp4',
'title': 'The Ol Raise and Switch!',
'description': 'md5:7db77f7107f98ba88401d3adc80ff7af',
},
'add_id': ['LimelightMedia'],
}, {
# no data-video-title
'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008',
'info_dict': {
'id': 'dfbaf830d7e54e179837c50c0c6cc0e1',
'ext': 'mp4',
'title': "Pokémon : L'ascension de Darkrai",
'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5',
},
'add_id': ['LimelightMedia'],
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2',
'only_matching': True,
}, {
'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/',
'only_matching': True,
}, {
'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id or display_id)
video_data = extract_attributes(self._search_regex(
r'(<[^>]+data-video-id="{}"[^>]*>)'.format(video_id if video_id else '[a-z0-9]{32}'),
webpage, 'video data element'))
video_id = video_data['data-video-id']
title = video_data.get('data-video-title') or self._html_search_meta(
'pkm-title', webpage, ' title', default=None) or self._search_regex(
r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title')
return {
'_type': 'url_transparent',
'id': video_id,
'url': f'limelight:media:{video_id}',
'title': title,
'description': video_data.get('data-video-summary'),
'thumbnail': video_data.get('data-video-poster'),
'series': 'Pokémon',
'season_number': int_or_none(video_data.get('data-video-season')),
'episode': title,
'episode_number': int_or_none(video_data.get('data-video-episode')),
'ie_key': 'LimelightMedia',
}
class PokemonWatchIE(InfoExtractor):
_VALID_URL = r'https?://watch\.pokemon\.com/[a-z]{2}-[a-z]{2}/(?:#/)?player(?:\.html)?\?id=(?P<id>[a-z0-9]{32})'
_API_URL = 'https://www.pokemon.com/api/pokemontv/v2/channels/{0:}'
_TESTS = [{
'url': 'https://watch.pokemon.com/en-us/player.html?id=8309a40969894a8e8d5bc1311e9c5667',
'md5': '62833938a31e61ab49ada92f524c42ff',
'info_dict': {
'id': '8309a40969894a8e8d5bc1311e9c5667',
'ext': 'mp4',
'title': 'Lillier and the Staff!',
'description': 'md5:338841b8c21b283d24bdc9b568849f04',
},
}, {
'url': 'https://watch.pokemon.com/en-us/#/player?id=3fe7752ba09141f0b0f7756d1981c6b2',
'only_matching': True,
}, {
'url': 'https://watch.pokemon.com/de-de/player.html?id=b3c402e111a4459eb47e12160ab0ba07',
'only_matching': True,
}]
def _extract_media(self, channel_array, video_id):
for channel in channel_array:
for media in channel.get('media'):
if media.get('id') == video_id:
return media
return None
def _real_extract(self, url):
video_id = self._match_id(url)
info = {
'_type': 'url',
'id': video_id,
'url': f'limelight:media:{video_id}',
'ie_key': 'LimelightMedia',
}
# API call can be avoided entirely if we are listing formats
if self.get_param('listformats', False):
return info
webpage = self._download_webpage(url, video_id)
build_vars = self._parse_json(self._search_regex(
r'(?s)buildVars\s*=\s*({.*?})', webpage, 'build vars'),
video_id, transform_source=js_to_json)
region = build_vars.get('region')
channel_array = self._download_json(self._API_URL.format(region), video_id)
video_data = self._extract_media(channel_array, video_id)
if video_data is None:
raise ExtractorError(
f'Video {video_id} does not exist', expected=True)
info['_type'] = 'url_transparent'
images = video_data.get('images')
return merge_dicts(info, {
'title': video_data.get('title'),
'description': video_data.get('description'),
'thumbnail': images.get('medium') or images.get('small'),
'series': 'Pokémon',
'season_number': int_or_none(video_data.get('season')),
'episode': video_data.get('title'),
'episode_number': int_or_none(video_data.get('episode')),
})

View File

@ -199,8 +199,9 @@ class SonyLIVSeriesIE(InfoExtractor):
},
}]
_API_BASE = 'https://apiv2.sonyliv.com/AGL'
_SORT_ORDERS = ('asc', 'desc')
def _entries(self, show_id):
def _entries(self, show_id, sort_order):
headers = {
'Accept': 'application/json, text/plain, */*',
'Referer': 'https://www.sonyliv.com',
@ -215,6 +216,9 @@ class SonyLIVSeriesIE(InfoExtractor):
'from': '0',
'to': '49',
}), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
if sort_order == 'desc':
seasons = reversed(seasons)
for season in seasons:
season_id = str(season['id'])
note = traverse_obj(season, ('metadata', 'title', {str})) or 'season'
@ -226,7 +230,7 @@ class SonyLIVSeriesIE(InfoExtractor):
'from': str(cursor),
'to': str(cursor + 99),
'orderBy': 'episodeNumber',
'sortOrder': 'asc',
'sortOrder': sort_order,
}), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
if not episodes:
break
@ -237,4 +241,10 @@ class SonyLIVSeriesIE(InfoExtractor):
def _real_extract(self, url):
show_id = self._match_id(url)
return self.playlist_result(self._entries(show_id), playlist_id=show_id)
sort_order = self._configuration_arg('sort_order', [self._SORT_ORDERS[0]])[0]
if sort_order not in self._SORT_ORDERS:
raise ValueError(
f'Invalid sort order "{sort_order}". Allowed values are: {", ".join(self._SORT_ORDERS)}')
return self.playlist_result(self._entries(show_id, sort_order), playlist_id=show_id)

113
yt_dlp/extractor/uliza.py Normal file
View File

@ -0,0 +1,113 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
make_archive_id,
parse_qs,
time_seconds,
)
from ..utils.traversal import traverse_obj
class UlizaPlayerIE(InfoExtractor):
_VALID_URL = r'https://player-api\.p\.uliza\.jp/v1/players/[^?#]+\?(?:[^#]*&)?name=(?P<id>[^#&]+)'
_TESTS = [{
'url': 'https://player-api.p.uliza.jp/v1/players/timeshift-disabled/pia/admin?type=normal&playerobjectname=ulizaPlayer&name=livestream01_dvr&repeatable=true',
'info_dict': {
'id': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
'ext': 'mp4',
'title': '88f3109a-f503-4d0f-a9f7-9f39ac745d84',
'live_status': 'was_live',
'_old_archive_ids': ['piaulizaportal 88f3109a-f503-4d0f-a9f7-9f39ac745d84'],
},
}, {
'url': 'https://player-api.p.uliza.jp/v1/players/uliza_jp_gallery_normal/promotion/admin?type=presentation&name=cookings&targetid=player1',
'info_dict': {
'id': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
'ext': 'mp4',
'title': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
'live_status': 'not_live',
'_old_archive_ids': ['piaulizaportal ae350126-5e22-4a7f-a8ac-8d0fd448b800'],
},
}, {
'url': 'https://player-api.p.uliza.jp/v1/players/default-player/pia/admin?type=normal&name=pia_movie_uliza_fix&targetid=ulizahtml5&repeatable=true',
'info_dict': {
'id': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
'ext': 'mp4',
'title': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
'live_status': 'not_live',
'_old_archive_ids': ['piaulizaportal 0644ecc8-e354-41b4-b957-3b08a2d63df1'],
},
}]
def _real_extract(self, url):
display_id = self._match_id(url)
player_data = self._download_webpage(
url, display_id, headers={'Referer': 'https://player-api.p.uliza.jp/'},
note='Fetching player data', errnote='Unable to fetch player data')
m3u8_url = self._search_regex(
r'["\'](https://vms-api\.p\.uliza\.jp/v1/prog-index\.m3u8[^"\']+)', player_data, 'm3u8 url')
video_id = parse_qs(m3u8_url).get('ss', [display_id])[0]
formats = self._extract_m3u8_formats(m3u8_url, video_id)
m3u8_type = self._search_regex(
r'/hls/(dvr|video)/', traverse_obj(formats, (0, 'url')), 'm3u8 type', default=None)
return {
'id': video_id,
'title': video_id,
'formats': formats,
'live_status': {
'video': 'is_live',
'dvr': 'was_live', # short-term archives
}.get(m3u8_type, 'not_live'), # VOD or long-term archives
'_old_archive_ids': [make_archive_id('PIAULIZAPortal', video_id)],
}
class UlizaPortalIE(InfoExtractor):
IE_DESC = 'ulizaportal.jp'
_VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44',
'info_dict': {
'id': 'ae350126-5e22-4a7f-a8ac-8d0fd448b800',
'display_id': '005f18b7-e810-5618-cb82-0987c5755d44',
'title': 'プレゼンテーションプレイヤーのサンプル',
'live_status': 'not_live',
'_old_archive_ids': ['piaulizaportal ae350126-5e22-4a7f-a8ac-8d0fd448b800'],
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}, {
'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1',
'info_dict': {
'id': '0644ecc8-e354-41b4-b957-3b08a2d63df1',
'display_id': '005e1b23-fe93-5780-19a0-98e917cc4b7d',
'title': '【確認用】視聴サンプルページULIZA',
'live_status': 'not_live',
'_old_archive_ids': ['piaulizaportal 0644ecc8-e354-41b4-b957-3b08a2d63df1'],
},
'params': {
'skip_download': True,
'ignore_no_formats_error': True,
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0)))
if expires and expires <= time_seconds():
raise ExtractorError('The link is expired', video_id=video_id, expected=True)
webpage = self._download_webpage(url, video_id)
player_data_url = self._search_regex(
r'<script [^>]*\bsrc="(https://player-api\.p\.uliza\.jp/v1/players/[^"]+)"',
webpage, 'player data url')
return self.url_result(
player_data_url, UlizaPlayerIE, url_transparent=True,
display_id=video_id, video_title=self._html_extract_title(webpage))

View File

@ -1,189 +0,0 @@
import functools
import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
int_or_none,
parse_duration,
qualities,
remove_start,
strip_or_none,
)
class VeohIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|videos|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
_TESTS = [{
'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
'md5': '620e68e6a3cff80086df3348426c9ca3',
'info_dict': {
'id': 'v56314296nk7Zdmz3',
'ext': 'mp4',
'title': 'Straight Backs Are Stronger',
'description': 'md5:203f976279939a6dc664d4001e13f5f4',
'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th56314296\\.jpg(\\?.*)?',
'uploader': 'LUMOback',
'duration': 46,
'view_count': int,
'average_rating': int,
'comment_count': int,
'age_limit': 0,
'categories': ['technology_and_gaming'],
'tags': ['posture', 'posture', 'sensor', 'back', 'pain', 'wearable', 'tech', 'lumo'],
},
}, {
'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3',
'only_matching': True,
}, {
'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
'info_dict': {
'id': '27701988',
'ext': 'mp4',
'title': 'Chile workers cover up to avoid skin damage',
'description': 'md5:2bd151625a60a32822873efc246ba20d',
'uploader': 'afp-news',
'duration': 123,
},
'skip': 'This video has been deleted.',
}, {
'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
'md5': '4fde7b9e33577bab2f2f8f260e30e979',
'note': 'Embedded ooyala video',
'info_dict': {
'id': '69525809',
'ext': 'mp4',
'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
'uploader': 'newsy-videos',
},
'skip': 'This video has been deleted.',
}, {
'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
'only_matching': True,
}, {
'url': 'https://www.veoh.com/videos/v16374379WA437rMH',
'md5': 'cceb73f3909063d64f4b93d4defca1b3',
'info_dict': {
'id': 'v16374379WA437rMH',
'ext': 'mp4',
'title': 'Phantasmagoria 2, pt. 1-3',
'description': 'Phantasmagoria: a Puzzle of Flesh',
'thumbnail': 're:https://fcache\\.veoh\\.com/file/f/th16374379\\.jpg(\\?.*)?',
'uploader': 'davidspackage',
'duration': 968,
'view_count': int,
'average_rating': int,
'comment_count': int,
'age_limit': 18,
'categories': ['technology_and_gaming', 'gaming'],
'tags': ['puzzle', 'of', 'flesh'],
},
}]
def _real_extract(self, url):
video_id = self._match_id(url)
metadata = self._download_json(
'https://www.veoh.com/watch/getVideo/' + video_id,
video_id)
video = metadata['video']
title = video['title']
thumbnail_url = None
q = qualities(['Regular', 'HQ'])
formats = []
for f_id, f_url in video.get('src', {}).items():
if not f_url:
continue
if f_id == 'poster':
thumbnail_url = f_url
else:
formats.append({
'format_id': f_id,
'quality': q(f_id),
'url': f_url,
})
categories = metadata.get('categoryPath')
if not categories:
category = remove_start(strip_or_none(video.get('category')), 'category_')
categories = [category] if category else None
tags = video.get('tags')
return {
'id': video_id,
'title': title,
'description': video.get('description'),
'thumbnail': thumbnail_url,
'uploader': video.get('author', {}).get('nickname'),
'duration': int_or_none(video.get('lengthBySec')) or parse_duration(video.get('length')),
'view_count': int_or_none(video.get('views')),
'formats': formats,
'average_rating': int_or_none(video.get('rating')),
'comment_count': int_or_none(video.get('numOfComments')),
'age_limit': 18 if video.get('contentRatingId') == 2 else 0,
'categories': categories,
'tags': tags.split(', ') if tags else None,
}
class VeohUserIE(VeohIE): # XXX: Do not subclass from concrete IE
_VALID_URL = r'https?://(?:www\.)?veoh\.com/users/(?P<id>[\w-]+)'
IE_NAME = 'veoh:user'
_TESTS = [
{
'url': 'https://www.veoh.com/users/valentinazoe',
'info_dict': {
'id': 'valentinazoe',
'title': 'valentinazoe (Uploads)',
},
'playlist_mincount': 75,
},
{
'url': 'https://www.veoh.com/users/PiensaLibre',
'info_dict': {
'id': 'PiensaLibre',
'title': 'PiensaLibre (Uploads)',
},
'playlist_mincount': 2,
}]
_PAGE_SIZE = 16
def _fetch_page(self, uploader, page):
response = self._download_json(
'https://www.veoh.com/users/published/videos', uploader,
note=f'Downloading videos page {page + 1}',
headers={
'x-csrf-token': self._TOKEN,
'content-type': 'application/json;charset=UTF-8',
},
data=json.dumps({
'username': uploader,
'maxResults': self._PAGE_SIZE,
'page': page + 1,
'requestName': 'userPage',
}).encode())
if not response.get('success'):
raise ExtractorError(response['message'])
for video in response['videos']:
yield self.url_result(f'https://www.veoh.com/watch/{video["permalinkId"]}', VeohIE,
video['permalinkId'], video.get('title'))
def _real_initialize(self):
webpage = self._download_webpage(
'https://www.veoh.com', None, note='Downloading authorization token')
self._TOKEN = self._search_regex(
r'csrfToken:\s*(["\'])(?P<token>[0-9a-zA-Z]{40})\1', webpage,
'request token', group='token')
def _real_extract(self, url):
uploader = self._match_id(url)
return self.playlist_result(OnDemandPagedList(
functools.partial(self._fetch_page, uploader),
self._PAGE_SIZE), uploader, f'{uploader} (Uploads)')

View File

@ -5087,7 +5087,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
def _rich_entries(self, rich_grid_renderer):
renderer = traverse_obj(
rich_grid_renderer,
('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel'), any)) or {}
('content', ('videoRenderer', 'reelItemRenderer', 'playlistRenderer', 'shortsLockupViewModel', 'lockupViewModel'), any)) or {}
video_id = renderer.get('videoId')
if video_id:
yield self._extract_video(renderer)
@ -5114,6 +5114,18 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
})),
thumbnails=self._extract_thumbnails(renderer, 'thumbnail', final_key='sources'))
return
# lockupViewModel extraction
content_id = renderer.get('contentId')
if content_id and renderer.get('contentType') == 'LOCKUP_CONTENT_TYPE_PODCAST':
yield self.url_result(
f'https://www.youtube.com/playlist?list={content_id}',
ie=YoutubeTabIE, video_id=content_id,
**traverse_obj(renderer, {
'title': ('metadata', 'lockupMetadataViewModel', 'title', 'content', {str}),
}),
thumbnails=self._extract_thumbnails(renderer, (
'contentImage', 'collectionThumbnailViewModel', 'primaryThumbnail', 'thumbnailViewModel', 'image'), final_key='sources'))
return
def _video_entry(self, video_renderer):
video_id = video_renderer.get('videoId')
@ -6706,22 +6718,22 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
},
'playlist_count': 0,
}, {
# Podcasts tab, with rich entry playlistRenderers
# Podcasts tab, with rich entry lockupViewModel
'url': 'https://www.youtube.com/@99percentinvisiblepodcast/podcasts',
'info_dict': {
'id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw',
'channel_id': 'UCVMF2HD4ZgC0QHpU9Yq5Xrw',
'uploader_url': 'https://www.youtube.com/@99percentinvisiblepodcast',
'description': 'md5:3a0ed38f1ad42a68ef0428c04a15695c',
'title': '99 Percent Invisible - Podcasts',
'uploader': '99 Percent Invisible',
'title': '99% Invisible - Podcasts',
'uploader': '99% Invisible',
'channel_follower_count': int,
'channel_url': 'https://www.youtube.com/channel/UCVMF2HD4ZgC0QHpU9Yq5Xrw',
'tags': [],
'channel': '99 Percent Invisible',
'channel': '99% Invisible',
'uploader_id': '@99percentinvisiblepodcast',
},
'playlist_count': 0,
'playlist_count': 5,
}, {
# Releases tab, with rich entry playlistRenderers (same as Podcasts tab)
'url': 'https://www.youtube.com/@AHimitsu/releases',

348
yt_dlp/parsing.py Normal file
View File

@ -0,0 +1,348 @@
import collections
import contextlib
import itertools
import re
from html.parser import HTMLParser
from .compat import compat_HTMLParseError
from .utils import orderedSet
class HTMLIgnoreRanges:
"""check if an offset is within CDATA content elements (script, style) or XML comments
note:
* given offsets must be in increasing order
* no detection of nested constructs (e.g. comments within script tags)
usage:
ranges = HTMLIgnoreRanges(html)
if offset in ranges:
...
"""
REGEX = re.compile(r'<!--|--!?>|</?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)
def __init__(self, html):
self.html = html
self._last_match = None
self._final = False
def __contains__(self, offset):
assert isinstance(offset, int)
if not self._final and (self._last_match is None or offset >= self._last_match.end()):
match = self.REGEX.search(self.html, offset)
if match:
self._last_match = match
else:
self._final = True
if self._last_match is None:
return False
match_string = self._last_match.group()
if match_string.startswith('</') or match_string in ('-->', '--!>'):
return offset < self._last_match.start()
return offset >= self._last_match.end()
class HTMLTagParser(HTMLParser):
"""HTML parser which returns found elements as instances of 'Tag'
when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements
usage:
parser = HTMLTagParser()
for tag_obj in parser.taglist(html):
tag_obj.text_and_html()
"""
STRICT = False
ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
VOID_TAGS = {
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
}
class Tag:
__slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'
def __init__(self, name, *, string='', attrs=()):
self.name = name
self.string = string
self.attrs = tuple(attrs)
self._openrange = None
self._closerange = None
def __str__(self):
return self.name
def __repr__(self):
return f'{self.__class__.__name__}({str(self)!r})'
def __eq__(self, other):
return self.name == other
def openrange(self, offset, startlen=0):
if isinstance(offset, slice):
self._openrange = offset
else:
self._openrange = slice(offset, offset + startlen)
def closerange(self, offset, stoplen=0):
if isinstance(offset, slice):
self._closerange = offset
else:
self._closerange = slice(offset, offset + stoplen)
def opentag(self):
return self.string[self._openrange] if self._openrange else ''
def html(self):
if not self._openrange:
return ''
if self._closerange:
return self.string[self._openrange.start:self._closerange.stop]
return self.string[self._openrange]
def text(self):
if self._openrange and self._closerange:
return self.string[self._openrange.stop:self._closerange.start]
return ''
def text_and_html(self):
return self.text(), self.html()
class AbortException(Exception):
pass
def __init__(self):
self.tagstack = collections.deque()
self._nestedtags = [[]]
super().__init__()
self._offset = self.offset
def predicate(self, tag, attrs):
""" return True for every encountered opening tag that should be processed """
return True
def callback(self, tag_obj):
""" this will be called when the requested tag is closed """
def reset(self):
super().reset()
self.tagstack.clear()
def taglist(self, data, reset=True, depth_first=False):
""" parse data and return found tag objects
@param data: html string
@param reset: reset state
@param depth_first: return order: as opened (False), as closed (True), nested (None)
@return: list of Tag objects
"""
def flatten(_list, first=True):
rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
for item in rlist:
if isinstance(item, list):
yield from flatten(item, first=False)
else:
yield item
if reset:
self.reset()
with contextlib.suppress(HTMLTagParser.AbortException):
self.feed(data)
if self.STRICT and self.tagstack:
orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
raise compat_HTMLParseError(f'unclosed tag {orphans}')
taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
self._nestedtags = [[]]
return taglist
def updatepos(self, i, j):
offset = self._offset = super().updatepos(i, j)
return offset
def handle_starttag(self, tag, attrs):
try:
# we use internal variable for performance reasons
tag_text = getattr(self, '_HTMLParser__starttag_text')
except AttributeError:
tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
tag_obj = tag
tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
if self.predicate(tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text))
nesting = [tag_obj]
self._nestedtags[-1].append(nesting)
if tag_is_open:
self._nestedtags.append(nesting)
else:
self.callback(tag_obj)
if tag_is_open:
self.tagstack.appendleft(tag_obj)
handle_startendtag = handle_starttag
def handle_endtag(self, tag):
if '<' in tag:
if self.STRICT:
raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
tag = tag[:tag.index('<')]
try:
idx = self.tagstack.index(tag)
if self.STRICT and idx:
open_tags = ''.join(f'</{tag}>' for tag in itertools.islice(self.tagstack, idx))
raise compat_HTMLParseError(
f'malnested closing tag {tag!r}, expected after {open_tags!r}')
tag_obj = self.tagstack[idx]
self.tagstack.remove(tag)
if isinstance(tag_obj, self.Tag):
tag_obj.closerange(slice(self._offset, self.rawdata.find('>', self._offset) + 1))
self._nestedtags.pop()
self.callback(tag_obj)
except ValueError as exc:
if isinstance(exc, compat_HTMLParseError):
raise
if self.STRICT:
raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
class MatchingElementParser(HTMLTagParser):
""" optimized version of HTMLTagParser
"""
def __init__(self, matchfunc):
super().__init__()
self.matchfunc = matchfunc
self.found_none = True
def reset(self):
super().reset()
self.found_none = True
def callback(self, tag_obj):
raise self.AbortException()
def predicate(self, tag, attrs):
if self.found_none and self.matchfunc(tag, attrs):
self.found_none = False
return True
return False
@staticmethod
def class_value_regex(class_name):
return rf'[\w\s\-]*(?<![\w\-]){re.escape(class_name)}(?![\w\-])[\w\s\-]*'
@staticmethod
def matching_tag_regex(tag, attribute, value_regex, escape=True):
if isinstance(value_regex, re.Pattern):
value_regex = value_regex.pattern
elif escape:
value_regex = re.escape(value_regex)
return rf'''(?x)
<(?i:{tag})
(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
'''
@classmethod
def iter_tags(cls, regex, html, *, matchfunc):
ignored = HTMLIgnoreRanges(html)
parser = cls(matchfunc)
for match in re.finditer(regex, html):
if match.start() not in ignored:
yield from parser.taglist(html[match.start():], reset=True)
@classmethod
def tags_by_name(cls, tag, html):
def matchfunc(tag_str, _attrs):
return tag_str == tag
tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
@classmethod
def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
def matchfunc(_tag_str, attrs):
return any(attr == attribute and re.fullmatch(value, value_str)
for attr, value_str in attrs)
tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
@classmethod
def extract_attributes(cls, html):
attr_dict = {}
def matchfunc(_tag, attrs):
attr_dict.update(attrs)
raise cls.AbortException()
with contextlib.suppress(cls.AbortException):
cls(matchfunc).feed(html)
return attr_dict
@classmethod
def get_elements_text_and_html_by_tag(cls, tag, html):
return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]
@classmethod
def get_element_text_and_html_by_tag(cls, tag, html):
tag = next(cls.tags_by_name(tag, html), None)
return tag and tag.text_and_html()
@classmethod
def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]
@classmethod
def get_elements_by_attribute(cls, *args, **kwargs):
return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]
@classmethod
def get_elements_html_by_attribute(cls, *args, **kwargs):
return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]
@classmethod
def get_element_by_attribute(cls, *args, **kwargs):
tag = next(cls.tags_by_attribute(*args, **kwargs), None)
return tag and tag.text()
@classmethod
def get_element_html_by_attribute(cls, *args, **kwargs):
tag = next(cls.tags_by_attribute(*args, **kwargs), None)
return tag and tag.html()
@classmethod
def get_elements_by_class(cls, class_name, html):
value = cls.class_value_regex(class_name)
return [tag.text() for tag
in cls.tags_by_attribute('class', value, html, escape_value=False)]
@classmethod
def get_elements_html_by_class(cls, class_name, html):
value = cls.class_value_regex(class_name)
return [tag.html() for tag
in cls.tags_by_attribute('class', value, html, escape_value=False)]
@classmethod
def get_elements_text_and_html_by_class(cls, class_name, html):
value = cls.class_value_regex(class_name)
return [tag.text_and_html() for tag
in cls.tags_by_attribute('class', value, html, escape_value=False)]
@classmethod
def get_element_html_by_class(cls, class_name, html):
value = cls.class_value_regex(class_name)
tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
return tag and tag.html()
@classmethod
def get_element_by_class(cls, class_name, html):
value = cls.class_value_regex(class_name)
tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
return tag and tag.text()

View File

@ -408,17 +408,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
pass
def handle_starttag(self, tag, _):
self.tagstack.append(tag)
self.tagstack.appendleft(tag)
def handle_endtag(self, tag):
if not self.tagstack:
raise compat_HTMLParseError('no tags in the stack')
while self.tagstack:
inner_tag = self.tagstack.pop()
if inner_tag == tag:
break
else:
raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
with contextlib.suppress(ValueError):
self.tagstack.remove(tag)
if not self.tagstack:
raise self.HTMLBreakOnClosingTagException
@ -452,6 +448,8 @@ def get_element_text_and_html_by_tag(tag, html):
next_closing_tag_end = next_closing_tag_start + len(closing_tag)
try:
parser.feed(html[offset:offset + next_closing_tag_end])
if tag not in parser.tagstack:
raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
offset += next_closing_tag_end
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
return html[content_start:offset + next_closing_tag_start], \