Compare commits

...

19 Commits

Author SHA1 Message Date
MMM
5ae4d54ce4
Merge a91d9e1084 into eb15fd5a32 2024-11-17 21:33:20 +05:30
krichbanana
eb15fd5a32
[ie/kenh14] Add extractor (#3996)
Closes #3937
Authored by: krichbanana, pzhlkj6612

Co-authored-by: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
2024-11-17 14:12:26 +00:00
sepro
7cecd299e4
[ie/chaturbate] Don't break embed detection (#11565)
Bugfix for 720b3dc453

Authored by: seproDev
2024-11-17 13:32:12 +01:00
flashdagger
a91d9e1084
[parsing] support comment end tag '--!>' as suggested by github-advanced-security bot 2023-11-13 07:19:42 +01:00
flashdagger
c34166d7c8
[parsing] support uppercase SCRIPT tags as suggested by github-advanced-security bot 2023-11-13 06:54:28 +01:00
flashdagger
b35550248a
Merge branch 'master' into html-parsing-relaxed 2023-11-13 06:40:03 +01:00
Marcel
7a9dd3d35f
[parsing] inline tag_obj.closerange() 2023-03-18 18:38:49 +01:00
Marcel
8d87bb4d91
[parsing] unify tag nesting 2023-03-18 18:38:48 +01:00
Marcel
65f91148fc
[parsing] search for case-insensitive tag names 2023-03-18 18:38:48 +01:00
Marcel
6169b3eca8
[parsing] replace HTMLCommentRanges with HTMLIgnoreRanges
* ignore matches within CDATA elements and comments
2023-03-18 18:38:47 +01:00
Marcel
29278a3323
[parsing] fix return value 2023-03-18 18:38:46 +01:00
Marcel
7a67a2028f
[parsing] tweak tag regex 2023-03-18 18:38:46 +01:00
Marcel
dbf350c122
[parsing] return unclosed matched tags 2023-03-18 18:38:45 +01:00
Marcel
8451074b50
[parsing] fix: don't push unmatched void tags onto queue 2023-03-18 18:38:45 +01:00
Marcel
176a156c65
[parsing] rework interface, implemented all get_element(s) functions + extract_attributes() as MatchingElementParser class methods and improve performance 2023-03-18 18:38:44 +01:00
Marcel
e092ba9922
[test] rollback test_utils.py and add related tests to test_parsing.py 2023-03-18 18:38:44 +01:00
Marcel
5e3894df3f
[parsing] add new module containing various HTML parser classes as replacement for utils.get_html_... functions
* performance is mostly better for large HTML data and on PyPy
2023-03-18 18:38:43 +01:00
Marcel
af03fa4542
[utils] more forgiving html parsing + unit tests 2023-03-18 18:38:43 +01:00
Marcel
da0d84258b
[test/test_utils] refactor test_get_element_text_and_html_by_tag() 2023-03-18 18:38:37 +01:00
6 changed files with 878 additions and 9 deletions

359
test/test_parsing.py Normal file
View File

@ -0,0 +1,359 @@
import textwrap
import unittest
from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
MatchingElementParser,
HTMLIgnoreRanges,
HTMLTagParser,
)
# Bind the MatchingElementParser classmethods to module-level names so the
# tests below can call them exactly like the utils.get_html_* helpers
# they are meant to replace.
extract_attributes = MatchingElementParser.extract_attributes
get_element_by_attribute = MatchingElementParser.get_element_by_attribute
get_element_by_class = MatchingElementParser.get_element_by_class
get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
get_element_html_by_class = MatchingElementParser.get_element_html_by_class
get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
get_elements_by_class = MatchingElementParser.get_elements_by_class
get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag
class TestParsing(unittest.TestCase):
    """Tests for yt_dlp.parsing (HTMLTagParser / MatchingElementParser / HTMLIgnoreRanges).

    NOTE(review): indentation inside the whitespace-sensitive string literals
    below was lost when this file was extracted.  The nesting in
    `test_get_element_text_and_html_by_tag` was reconstructed so that the
    hard-coded slice indices ([32:276] and [78:119]) hold, and the marker
    lines in `test_html_ignored_ranges` were recomputed from the
    HTMLIgnoreRanges semantics — verify both against the original file.
    """

    def test_extract_attributes(self):
        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
        self.assertEqual(extract_attributes('<e x >'), {'x': None})
        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
        self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
        # "Narrow" Python builds don't support unicode code points outside BMP.
        try:
            chr(0x10000)
            supports_outside_bmp = True
        except ValueError:
            supports_outside_bmp = False
        if supports_outside_bmp:
            self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
        # Malformed HTML should not break attributes extraction on older Python
        self.assertEqual(extract_attributes('<mal"formed/>'), {})

    GET_ELEMENT_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span>
<div class="foo bar">also nice</div>
'''

    def test_get_element_by_class(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_by_class('foo', html), 'nice')
        self.assertEqual(get_element_by_class('no-such-class', html), None)

    def test_get_element_html_by_class(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_html_by_class('foo', html),
                         '<span class="foo bar">nice</span>')
        self.assertEqual(get_element_by_class('no-such-class', html), None)

    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
<div itemprop="author" itemscope>foo</div>
'''

    def test_get_element_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
        self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
        self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')
        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')

    def test_get_element_html_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html),
                         '<span class="foo bar">nice</span>')
        self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
        self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
        self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())

    GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span>
<span class="foo bar">also nice</span>
'''
    GET_ELEMENTS_BY_CLASS_RES = [
        '<span class="foo bar">nice</span>',
        '<span class="foo bar">also nice</span>'
    ]

    def test_get_elements_by_class(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_class('no-such-class', html), [])

    def test_get_elements_html_by_class(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
        self.assertEqual(get_elements_html_by_class('no-such-class', html), [])

    def test_get_elements_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])

    def test_get_elements_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html),
                         self.GET_ELEMENTS_BY_CLASS_RES)
        self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])

    def test_get_elements_text_and_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
        self.assertEqual(
            get_elements_text_and_html_by_attribute('class', 'foo bar', html),
            list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
        self.assertEqual(get_elements_text_and_html_by_attribute(
            'class', 'foo', '<a class="foo">nice</a><span class="foo">not nice</span>', tag='a'),
            [('nice', '<a class="foo">nice</a>')])

    def test_get_element_text_and_html_by_tag(self):
        # nesting below is load-bearing: the slice indices further down
        # ([32:276], [78:119]) address the <div>/<span> elements inside the
        # re-indented string
        get_element_by_tag_test_string = '''
random text lorem ipsum</p>
<div>
    this should be returned
    <span>this should also be returned</span>
    <div>
        this should also be returned
    </div>
    closing tag above should not trick, so this should also be returned
</div>
but this text should not be returned
'''
        html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
        get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
        get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
        get_element_by_tag_res_innerspan_html = html.strip()[78:119]
        get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
        self.assertEqual(
            get_element_text_and_html_by_tag('div', html),
            (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
        self.assertEqual(
            get_element_text_and_html_by_tag('span', html),
            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
        self.assertIsNone(get_element_text_and_html_by_tag('article', html))

    def test_get_elements_text_and_html_by_tag(self):
        class StrictParser(MatchingElementParser):
            STRICT = True

        test_string = '''
<img src="a.png">
<img src="b.png" />
<span>ignore</span>
'''
        items = get_elements_text_and_html_by_tag('img', test_string)
        self.assertEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
        self.assertEqual(
            StrictParser.get_element_text_and_html_by_tag('use', '<use><img></use>'),
            ('<img>', '<use><img></use>'))

    def test_get_element_text_and_html_by_tag_malformed(self):
        inner_text = 'inner text'
        malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
        commented_html = '<!--<div>inner comment</div>-->'
        outerdiv_html = f'<div>{malnested_elements}</div>'
        html = f'{commented_html}{outerdiv_html}'
        self.assertEqual(
            get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
        self.assertEqual(
            get_element_text_and_html_by_tag('malnested_a', html),
            (f'<malnested_b>{inner_text}',
             f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
        self.assertEqual(
            get_element_text_and_html_by_tag('malnested_b', html),
            (f'{inner_text}</malnested_a>',
             f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
        self.assertEqual(
            get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
        # ignore case on tags
        ci_html = f'<SpAn>{html}</sPaN>'
        self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))

    def test_strict_html_parsing(self):
        class StrictTagParser(HTMLTagParser):
            STRICT = True

        parser = StrictTagParser()
        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
            parser.taglist('</p>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
            parser.taglist('<div><p>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
            parser.taglist('<div><p></div></p>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
            parser.taglist('<div><p>/p></div>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
            parser.taglist('<div><p></p<< </div>', reset=True)
        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
            parser.taglist('<img>must be empty</img>', reset=True)

    def test_relaxed_html_parsing(self):
        Tag = HTMLTagParser.Tag
        parser = HTMLTagParser()
        self.assertEqual(parser.taglist('</p>', reset=True), [])
        tags = parser.taglist('<div><p>', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('', '<div>'))
        self.assertEqual(tags[1].text_and_html(), ('', '<p>'))
        tags = parser.taglist('<div><p></div></p>', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('<p>', '<div><p></div>'))
        self.assertEqual(tags[1].text_and_html(), ('</div>', '<p></div></p>'))
        tags = parser.taglist('<div><p>/p></div>', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(), ('<p>/p>', '<div><p>/p></div>'))
        self.assertEqual(tags[1].text_and_html(), ('', '<p>'))
        tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
        self.assertEqual(tags, [Tag('div'), Tag('p')])
        self.assertEqual(tags[0].text_and_html(),
                         ('<p>paragraph</p<ignored>', '<div><p>paragraph</p<ignored></div>'))
        self.assertEqual(tags[1].text_and_html(), ('paragraph', '<p>paragraph</p<ignored>'))
        tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
        self.assertEqual(tags, [Tag('img')])
        self.assertEqual(tags[0].text_and_html(), ('', '<img width="300px">'))

    def test_compliant_html_parsing(self):
        # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
        Tag = HTMLTagParser.Tag
        html = '''
no error without closing tag: <img>
self closing is ok: <img />
'''
        parser = HTMLTagParser()
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags, [Tag('img'), Tag('img')])
        # don't get fooled by '>' in attributes
        html = '''<img greater_a='1>0' greater_b="1>0">'''
        tags = parser.taglist(html, reset=True)
        self.assertEqual(tags[0].text_and_html(), ('', html))

    def test_tag_return_order(self):
        Tag = HTMLTagParser.Tag
        html = '''
<t0>
<t1>
<t2>
<t3 /> <t4 />
</t2>
</t1>
<t5>
<t6 />
</t5>
</t0>
<t7>
<t8 />
</t7>
'''
        parser = HTMLTagParser()
        tags = parser.taglist(html, reset=True)
        self.assertEqual(
            str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
                            Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
        tags = parser.taglist(html, reset=True, depth_first=True)
        self.assertEqual(
            str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
                            Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
        # return tags in nested order
        tags = parser.taglist(html, reset=True, depth_first=None)
        self.assertEqual(
            str(tags), str([
                [Tag('t0'),
                 [Tag('t1'),
                  [Tag('t2'), [Tag('t3')], [Tag('t4')]]],
                 [Tag('t5'), [Tag('t6')]]],
                [Tag('t7'), [Tag('t8')]]]))

    def test_html_ignored_ranges(self):
        def mark_comments(_string, char='^', nochar='-'):
            cmts = HTMLIgnoreRanges(_string)
            return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))

        # pairs of (input line, expected marker line); markers recomputed
        # from HTMLIgnoreRanges semantics after extraction broke alignment
        html_string = '''
no comments in this line
------------------------
<!-- whole line represents a comment -->
----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
before <!-- comment --> after
-----------^^^^^^^^^---------
this is a leftover comment --> <!-- a new comment without closing
^^^^^^^^^^^^^^^^^^^^^^^^^^^--------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
here is <!-- a comment --> and <!-- another comment --!> end
------------^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^--------
<script> ignore here </script> <SCRIPT> and here </SCRIPT>
--------^^^^^^^^^^^^^------------------^^^^^^^^^^---------
'''
        lines = textwrap.dedent(html_string).strip().splitlines()
        for line, marker in zip(lines[0::2], lines[1::2]):
            self.assertEqual((line, mark_comments(line)), (line, marker))

        # yet we must be able to match script elements
        test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
        items = get_element_text_and_html_by_tag('script', test_string)
        self.assertEqual(items, ("var foo = 'bar';", test_string))

View File

@ -946,6 +946,10 @@ from .kaltura import KalturaIE
from .kankanews import KankaNewsIE from .kankanews import KankaNewsIE
from .karaoketv import KaraoketvIE from .karaoketv import KaraoketvIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .kenh14 import (
Kenh14PlaylistIE,
Kenh14VideoIE,
)
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,

View File

@ -79,7 +79,7 @@ class ChaturbateIE(InfoExtractor):
'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True), 'formats': self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True),
} }
def _extract_from_webpage(self, video_id, tld): def _extract_from_html(self, video_id, tld):
webpage = self._download_webpage( webpage = self._download_webpage(
f'https://chaturbate.{tld}/{video_id}/', video_id, f'https://chaturbate.{tld}/{video_id}/', video_id,
headers=self.geo_verification_headers(), impersonate=True) headers=self.geo_verification_headers(), impersonate=True)
@ -151,4 +151,4 @@ class ChaturbateIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id, tld = self._match_valid_url(url).group('id', 'tld') video_id, tld = self._match_valid_url(url).group('id', 'tld')
return self._extract_from_api(video_id, tld) or self._extract_from_webpage(video_id, tld) return self._extract_from_api(video_id, tld) or self._extract_from_html(video_id, tld)

160
yt_dlp/extractor/kenh14.py Normal file
View File

@ -0,0 +1,160 @@
from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
get_element_by_class,
get_element_html_by_attribute,
get_elements_html_by_class,
int_or_none,
parse_duration,
parse_iso8601,
remove_start,
strip_or_none,
unescapeHTML,
update_url,
url_or_none,
)
from ..utils.traversal import traverse_obj
class Kenh14VideoIE(InfoExtractor):
    """Extractor for single videos hosted on video.kenh14.vn."""
    _VALID_URL = r'https?://video\.kenh14\.vn/(?:video/)?[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/video/mo-hop-iphone-14-pro-max-nguon-unbox-therapy-316173.chn',
        'md5': '1ed67f9c3a1e74acf15db69590cf6210',
        'info_dict': {
            'id': '316173',
            'ext': 'mp4',
            'title': 'Video mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'description': 'Video mở hộp iPhone 14 Pro MaxVideo mở hộp iPhone 14 Pro Max (Nguồn: Unbox Therapy)',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'uploader': 'Unbox Therapy',
            'upload_date': '20220517',
            'view_count': int,
            'duration': 722.86,
            'timestamp': 1652764468,
        },
    }, {
        'url': 'https://video.kenh14.vn/video-316174.chn',
        'md5': '2b41877d2afaf4a3f487ceda8e5c7cbd',
        'info_dict': {
            'id': '316174',
            'ext': 'mp4',
            'title': 'Khoảnh khắc VĐV nằm gục khóc sau chiến thắng: 7 năm trời Việt Nam mới có HCV kiếm chém nữ, chỉ có 8 tháng để khổ luyện trước khi lên sàn đấu',
            'description': 'md5:de86aa22e143e2b277bce8ec9c6f17dc',
            'thumbnail': r're:^https?://videothumbs\.mediacdn\.vn/.*\.jpg$',
            'tags': [],
            'upload_date': '20220517',
            'view_count': int,
            'duration': 70.04,
            'timestamp': 1652766021,
        },
    }, {
        'url': 'https://video.kenh14.vn/0-344740.chn',
        'md5': 'b843495d5e728142c8870c09b46df2a9',
        'info_dict': {
            'id': '344740',
            'ext': 'mov',
            'title': 'Kỳ Duyên đầy căng thẳng trong buổi ra quân đi Miss Universe, nghi thức tuyên thuệ lần đầu xuất hiện gây nhiều tranh cãi',
            'description': 'md5:2a2dbb4a7397169fb21ee68f09160497',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.jpg$',
            'tags': ['kỳ duyên', 'Kỳ Duyên tuyên thuệ', 'miss universe'],
            'uploader': 'Quang Vũ',
            'upload_date': '20241024',
            'view_count': int,
            'duration': 198.88,
            'timestamp': 1729741590,
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # the player element carries the CDN file path in its 'data-vid' attribute;
        # extraction is fatal here since nothing else identifies the media
        attrs = extract_attributes(get_element_html_by_attribute('type', 'VideoStream', webpage) or '')
        direct_url = attrs['data-vid']

        # metadata API is keyed by the file name relative to kenh14cdn.com
        metadata = self._download_json(
            'https://api.kinghub.vn/video/api/v1/detailVideoByGet?FileName={}'.format(
                remove_start(direct_url, 'kenh14cdn.com/')), video_id, fatal=False)

        # direct progressive file; adaptive (HLS/DASH) variants are added below if present
        formats = [{'url': f'https://{direct_url}', 'format_id': 'http', 'quality': 1}]
        subtitles = {}
        # sidecar JSON next to the media file may advertise 'hls'/'mpd' manifest URLs
        video_data = self._download_json(
            f'https://{direct_url}.json', video_id, note='Downloading video data', fatal=False)
        if hls_url := traverse_obj(video_data, ('hls', {url_or_none})):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, m3u8_id='hls', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        if dash_url := traverse_obj(video_data, ('mpd', {url_or_none})):
            fmts, subs = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        return {
            **traverse_obj(metadata, {
                'duration': ('duration', {parse_duration}),
                'uploader': ('author', {strip_or_none}),
                # API timestamps look like 'YYYY-MM-DD HH:MM:SS' (space-delimited ISO)
                'timestamp': ('uploadtime', {parse_iso8601(delimiter=' ')}),
                'view_count': ('views', {int_or_none}),
            }),
            'id': video_id,
            # prefer the API title, then OpenGraph, then the page markup
            'title': (
                traverse_obj(metadata, ('title', {strip_or_none}))
                or clean_html(self._og_search_title(webpage))
                or clean_html(get_element_by_class('vdbw-title', webpage))),
            'formats': formats,
            'subtitles': subtitles,
            'description': (
                clean_html(self._og_search_description(webpage))
                or clean_html(get_element_by_class('vdbw-sapo', webpage))),
            'thumbnail': (self._og_search_thumbnail(webpage) or attrs.get('data-thumb')),
            # keywords meta is a ';'-separated list; empty entries are filtered out
            'tags': traverse_obj(self._html_search_meta('keywords', webpage), (
                {lambda x: x.split(';')}, ..., filter)),
        }
class Kenh14PlaylistIE(InfoExtractor):
    """Extractor for playlist pages on video.kenh14.vn."""
    _VALID_URL = r'https?://video\.kenh14\.vn/playlist/[\w-]+-(?P<id>[0-9]+)\.chn'
    _TESTS = [{
        'url': 'https://video.kenh14.vn/playlist/tran-tinh-naked-love-mua-2-71.chn',
        'info_dict': {
            'id': '71',
            'title': 'Trần Tình (Naked love) mùa 2',
            'description': 'md5:e9522339304956dea931722dd72eddb2',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 9,
    }, {
        'url': 'https://video.kenh14.vn/playlist/0-72.chn',
        'info_dict': {
            'id': '72',
            'title': 'Lau Lại Đầu Từ',
            'description': 'Cùng xem xưa và nay có gì khác biệt nhé!',
            'thumbnail': r're:^https?://kenh14cdn\.com/.*\.png$',
        },
        'playlist_count': 6,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # the 'category-detail' block holds the playlist name/description markup
        category_detail = get_element_by_class('category-detail', webpage) or ''
        # JSON-LD fallback: pick the first entry that has both name fields set
        embed_info = traverse_obj(
            self._yield_json_ld(webpage, playlist_id),
            (lambda _, v: v['name'] and v['alternateName'], any)) or {}

        return self.playlist_from_matches(
            get_elements_html_by_class('video-item', webpage), playlist_id,
            (clean_html(get_element_by_class('name', category_detail)) or unescapeHTML(embed_info.get('name'))),
            # each item element carries the numeric video id in 'data-id'
            getter=lambda x: 'https://video.kenh14.vn/video/video-{}.chn'.format(extract_attributes(x)['data-id']),
            ie=Kenh14VideoIE, playlist_description=(
                clean_html(get_element_by_class('description', category_detail))
                or unescapeHTML(embed_info.get('alternateName'))),
            # strip tracking query params from the OpenGraph thumbnail
            thumbnail=traverse_obj(
                self._og_search_thumbnail(webpage),
                ({url_or_none}, {update_url(query=None)})))

348
yt_dlp/parsing.py Normal file
View File

@ -0,0 +1,348 @@
import collections
import contextlib
import itertools
import re
from html.parser import HTMLParser
from .compat import compat_HTMLParseError
from .utils import orderedSet
class HTMLIgnoreRanges:
    """Lazily track regions of an HTML string that lookups should skip:
    CDATA content elements (script, style) and XML comments.

    note:
        * membership tests must use monotonically increasing offsets
        * nested constructs (e.g. comments within script tags) are not detected

    usage:
        ranges = HTMLIgnoreRanges(html)
        if offset in ranges:
            ...
    """
    REGEX = re.compile(r'<!--|--!?>|</?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)

    def __init__(self, html):
        self.html = html
        self._last_match = None  # most recently found boundary marker
        self._final = False      # True once the regex is exhausted

    def __contains__(self, offset):
        assert isinstance(offset, int)

        # advance to the next boundary marker once the previous one is behind us
        needs_search = not self._final and (
            self._last_match is None or offset >= self._last_match.end())
        if needs_search:
            found = self.REGEX.search(self.html, offset)
            if found is None:
                self._final = True
            else:
                self._last_match = found

        last = self._last_match
        if last is None:
            return False

        # a closing marker means everything *before* it was ignorable;
        # an opening marker means everything *after* it is ignorable
        token = last.group()
        is_closer = token.startswith('</') or token in ('-->', '--!>')
        return offset < last.start() if is_closer else offset >= last.end()
class HTMLTagParser(HTMLParser):
    """HTML parser which returns found elements as instances of 'Tag'
    when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements

    usage:
        parser = HTMLTagParser()
        for tag_obj in parser.taglist(html):
            tag_obj.text_and_html()
    """
    # when True, stray/unclosed/malnested tags raise compat_HTMLParseError
    STRICT = False
    # matches one complete tag, tolerating '>' inside quoted attribute values
    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
    # elements that never take a closing tag (HTML void elements)
    VOID_TAGS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    }

    class Tag:
        """A found element; stores slices into the source string rather than copies."""
        __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'

        def __init__(self, name, *, string='', attrs=()):
            self.name = name
            self.string = string       # full html source the ranges index into
            self.attrs = tuple(attrs)
            self._openrange = None     # slice covering the opening '<tag ...>'
            self._closerange = None    # slice covering the closing '</tag>'

        def __str__(self):
            return self.name

        def __repr__(self):
            return f'{self.__class__.__name__}({str(self)!r})'

        def __eq__(self, other):
            # compares by name only, so a Tag can be found in a deque by plain string
            return self.name == other

        def openrange(self, offset, startlen=0):
            # accept either a prebuilt slice or (offset, length)
            if isinstance(offset, slice):
                self._openrange = offset
            else:
                self._openrange = slice(offset, offset + startlen)

        def closerange(self, offset, stoplen=0):
            # accept either a prebuilt slice or (offset, length)
            if isinstance(offset, slice):
                self._closerange = offset
            else:
                self._closerange = slice(offset, offset + stoplen)

        def opentag(self):
            # the raw opening tag text, e.g. '<div class="x">'
            return self.string[self._openrange] if self._openrange else ''

        def html(self):
            # outer html: opening tag through closing tag; just the opening
            # tag if the element was never closed
            if not self._openrange:
                return ''
            if self._closerange:
                return self.string[self._openrange.start:self._closerange.stop]
            return self.string[self._openrange]

        def text(self):
            # inner text between open and close tag; '' if not closed
            if self._openrange and self._closerange:
                return self.string[self._openrange.stop:self._closerange.start]
            return ''

        def text_and_html(self):
            return self.text(), self.html()

    class AbortException(Exception):
        # raised from callback() to stop feeding input early
        pass

    def __init__(self):
        self.tagstack = collections.deque()  # currently open tags, most recent first
        self._nestedtags = [[]]              # nested lists mirroring element nesting
        super().__init__()
        self._offset = self.offset

    def predicate(self, tag, attrs):
        """ return True for every encountered opening tag that should be processed """
        return True

    def callback(self, tag_obj):
        """ this will be called when the requested tag is closed """

    def reset(self):
        super().reset()
        self.tagstack.clear()

    def taglist(self, data, reset=True, depth_first=False):
        """ parse data and return found tag objects
        @param data: html string
        @param reset: reset state
        @param depth_first: return order: as opened (False), as closed (True), nested (None)
        @return: list of Tag objects
        """
        def flatten(_list, first=True):
            # depth-first: yield children before the element that contains them
            rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
            for item in rlist:
                if isinstance(item, list):
                    yield from flatten(item, first=False)
                else:
                    yield item

        if reset:
            self.reset()
        with contextlib.suppress(HTMLTagParser.AbortException):
            self.feed(data)
        if self.STRICT and self.tagstack:
            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
            raise compat_HTMLParseError(f'unclosed tag {orphans}')
        # depth_first=None returns the raw nested list structure
        taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
        self._nestedtags = [[]]
        return taglist

    def updatepos(self, i, j):
        # cache the current character offset for the range bookkeeping below
        offset = self._offset = super().updatepos(i, j)
        return offset

    def handle_starttag(self, tag, attrs):
        try:
            # we use internal variable for performance reasons
            tag_text = getattr(self, '_HTMLParser__starttag_text')
        except AttributeError:
            tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
        tag_obj = tag
        # self-closed ('.../>') and void tags never go onto the open-tag stack
        tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
        if self.predicate(tag, attrs):
            tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
            tag_obj.openrange(self._offset, len(tag_text))
            nesting = [tag_obj]
            self._nestedtags[-1].append(nesting)
            if tag_is_open:
                self._nestedtags.append(nesting)
            else:
                # already complete: report it immediately
                self.callback(tag_obj)
        if tag_is_open:
            self.tagstack.appendleft(tag_obj)

    # explicit self-closing tags are handled exactly like opening tags
    handle_startendtag = handle_starttag

    def handle_endtag(self, tag):
        if '<' in tag:
            # e.g. '</p<<' — truncate the garbage unless STRICT
            if self.STRICT:
                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
            tag = tag[:tag.index('<')]
        try:
            # Tag.__eq__ compares by name, so index() accepts a plain string
            idx = self.tagstack.index(tag)
            if self.STRICT and idx:
                open_tags = ''.join(f'</{tag}>' for tag in itertools.islice(self.tagstack, idx))
                raise compat_HTMLParseError(
                    f'malnested closing tag {tag!r}, expected after {open_tags!r}')
            tag_obj = self.tagstack[idx]
            self.tagstack.remove(tag)
            if isinstance(tag_obj, self.Tag):
                # close range spans from here to the end of this closing tag
                tag_obj.closerange(slice(self._offset, self.rawdata.find('>', self._offset) + 1))
                self._nestedtags.pop()
                self.callback(tag_obj)
        except ValueError as exc:
            # compat_HTMLParseError may subclass ValueError — re-raise untouched
            if isinstance(exc, compat_HTMLParseError):
                raise
            if self.STRICT:
                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
class MatchingElementParser(HTMLTagParser):
    """ optimized version of HTMLTagParser
    """

    def __init__(self, matchfunc):
        super().__init__()
        self.matchfunc = matchfunc   # predicate(tag, attrs) selecting the wanted element
        self.found_none = True       # True until the first matching tag is seen

    def reset(self):
        super().reset()
        self.found_none = True

    def callback(self, tag_obj):
        # the single matching element is complete: stop feeding immediately
        raise self.AbortException()

    def predicate(self, tag, attrs):
        # track only the first tag accepted by matchfunc
        if self.found_none and self.matchfunc(tag, attrs):
            self.found_none = False
            return True
        return False

    @staticmethod
    def class_value_regex(class_name):
        # matches class_name as a whole word within a space-separated class list
        return rf'[\w\s\-]*(?<![\w\-]){re.escape(class_name)}(?![\w\-])[\w\s\-]*'

    @staticmethod
    def matching_tag_regex(tag, attribute, value_regex, escape=True):
        # build a regex locating '<tag ... attribute="value">' candidates
        if isinstance(value_regex, re.Pattern):
            value_regex = value_regex.pattern
        elif escape:
            value_regex = re.escape(value_regex)

        return rf'''(?x)
            <(?i:{tag})
            (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
            \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
        '''

    @classmethod
    def iter_tags(cls, regex, html, *, matchfunc):
        # fast regex pre-scan; the HTML parser only runs from each candidate
        # offset, and offsets inside comments/script/style are skipped
        ignored = HTMLIgnoreRanges(html)
        parser = cls(matchfunc)
        for match in re.finditer(regex, html):
            if match.start() not in ignored:
                yield from parser.taglist(html[match.start():], reset=True)

    @classmethod
    def tags_by_name(cls, tag, html):
        def matchfunc(tag_str, _attrs):
            return tag_str == tag

        # case-insensitive tag name, attributes may contain quoted '>'
        tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)

    @classmethod
    def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
        def matchfunc(_tag_str, attrs):
            return any(attr == attribute and re.fullmatch(value, value_str)
                       for attr, value_str in attrs)

        tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)

    @classmethod
    def extract_attributes(cls, html):
        attr_dict = {}

        def matchfunc(_tag, attrs):
            # capture the first tag's attributes, then abort parsing
            attr_dict.update(attrs)
            raise cls.AbortException()

        with contextlib.suppress(cls.AbortException):
            cls(matchfunc).feed(html)

        return attr_dict

    @classmethod
    def get_elements_text_and_html_by_tag(cls, tag, html):
        return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]

    @classmethod
    def get_element_text_and_html_by_tag(cls, tag, html):
        tag = next(cls.tags_by_name(tag, html), None)
        return tag and tag.text_and_html()

    @classmethod
    def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
        return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]

    @classmethod
    def get_elements_by_attribute(cls, *args, **kwargs):
        return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]

    @classmethod
    def get_elements_html_by_attribute(cls, *args, **kwargs):
        return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]

    @classmethod
    def get_element_by_attribute(cls, *args, **kwargs):
        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
        return tag and tag.text()

    @classmethod
    def get_element_html_by_attribute(cls, *args, **kwargs):
        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
        return tag and tag.html()

    @classmethod
    def get_elements_by_class(cls, class_name, html):
        value = cls.class_value_regex(class_name)
        return [tag.text() for tag
                in cls.tags_by_attribute('class', value, html, escape_value=False)]

    @classmethod
    def get_elements_html_by_class(cls, class_name, html):
        value = cls.class_value_regex(class_name)
        return [tag.html() for tag
                in cls.tags_by_attribute('class', value, html, escape_value=False)]

    @classmethod
    def get_elements_text_and_html_by_class(cls, class_name, html):
        value = cls.class_value_regex(class_name)
        return [tag.text_and_html() for tag
                in cls.tags_by_attribute('class', value, html, escape_value=False)]

    @classmethod
    def get_element_html_by_class(cls, class_name, html):
        value = cls.class_value_regex(class_name)
        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
        return tag and tag.html()

    @classmethod
    def get_element_by_class(cls, class_name, html):
        value = cls.class_value_regex(class_name)
        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
        return tag and tag.text()

View File

@ -408,17 +408,13 @@ class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
pass pass
def handle_starttag(self, tag, _): def handle_starttag(self, tag, _):
self.tagstack.append(tag) self.tagstack.appendleft(tag)
def handle_endtag(self, tag): def handle_endtag(self, tag):
if not self.tagstack: if not self.tagstack:
raise compat_HTMLParseError('no tags in the stack') raise compat_HTMLParseError('no tags in the stack')
while self.tagstack: with contextlib.suppress(ValueError):
inner_tag = self.tagstack.pop() self.tagstack.remove(tag)
if inner_tag == tag:
break
else:
raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
if not self.tagstack: if not self.tagstack:
raise self.HTMLBreakOnClosingTagException raise self.HTMLBreakOnClosingTagException
@ -452,6 +448,8 @@ def get_element_text_and_html_by_tag(tag, html):
next_closing_tag_end = next_closing_tag_start + len(closing_tag) next_closing_tag_end = next_closing_tag_start + len(closing_tag)
try: try:
parser.feed(html[offset:offset + next_closing_tag_end]) parser.feed(html[offset:offset + next_closing_tag_end])
if tag not in parser.tagstack:
raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
offset += next_closing_tag_end offset += next_closing_tag_end
except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException: except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
return html[content_start:offset + next_closing_tag_start], \ return html[content_start:offset + next_closing_tag_start], \