mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2024-09-20 09:21:25 +02:00
Compare commits
No commits in common. "360ecdd6145f304d38ba0735f52b3622d3634808" and "197bcd877c012ee2d5bfc9ae1d63eed08362c62f" have entirely different histories.
360ecdd614
...
197bcd877c
|
@ -1,4 +1,8 @@
|
|||
# flake8: noqa: F401
|
||||
from .tudou import(TudouIE,
|
||||
|
||||
|
||||
)
|
||||
|
||||
from .youtube import ( # Youtube is moved to the top to improve performance
|
||||
YoutubeIE,
|
||||
|
@ -2047,7 +2051,6 @@ from .tubitv import (
|
|||
TubiTvIE,
|
||||
TubiTvShowIE,
|
||||
)
|
||||
from .tudou import TudouIE
|
||||
from .tumblr import TumblrIE
|
||||
from .tunein import (
|
||||
TuneInStationIE,
|
||||
|
|
|
@ -2,63 +2,63 @@ from .common import InfoExtractor
|
|||
|
||||
|
||||
class TudouIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(?P<id>id_[\w=.]+)'
|
||||
_VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(id_[a-zA-Z0-9_=.]+)'
|
||||
_TESTS = [{
|
||||
'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng',
|
||||
'md5': 'to be updated',
|
||||
|
||||
# This code successfully downloaded the .mp4 file and passed the test, except for the md5 check.
|
||||
# I moved this code to the extractor folder of the released version of yt-dlp, trying to see if it works properly in there.
|
||||
# If it had worked, I could have calculated the md5 of the first 10 KB and compared the two md5 values.
|
||||
# Unfortunately, it didn't.
|
||||
# My guess is that it has a problem extracting the video id, but I can't prove it.
|
||||
|
||||
# But it does work here when I run 'python test/test_download.py TestDownload.test_Tudou'.
|
||||
# That's why it didn't pass the md5 test, because I couldn't download the first 10kb with the released version of yt-dlp.
|
||||
# So there's nothing to compare
|
||||
|
||||
# I'm interested to know if there's another way to download the first 10kb.
|
||||
# Currently it's a paradox to me:
|
||||
# To finalise this code, I need to get the first 10kb, to do the last comparison.
|
||||
# But this code doesn't work in the released yt-dlp, so I can't get the first 10kb.
|
||||
# So how can I.....
|
||||
|
||||
# 'f33b73e7470c45b7d3c4f7d8b34eda14',
|
||||
# This md5 is taken from the output of the command 'python test/test_download.py TestDownload.test_Tudou'.
|
||||
# the downloaded file is deleted automatically, not giving me a chance to calculate its md5 manually.
|
||||
'md5': 'failed to get the first 10kb',
|
||||
|
||||
'info_dict': {
|
||||
'id': 'XNjAxNjI2OTU3Ng==',
|
||||
'ext': 'mp4',
|
||||
'title': '阿斯塔意识到哈里杀了人,自己被骗了',
|
||||
'show_name': '外星居民 第一季',
|
||||
'title': '外星居民 第一季 阿斯塔意识到哈里杀了人,自己被骗了-电视剧-高清完整正版视频在线观看-优酷',
|
||||
},
|
||||
# 'skip': 'testing skip function',
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
# About video_id
|
||||
# .get_temp_id returns None
|
||||
# ._match_id doesn't work either
|
||||
# I don't know how to fix this, but line 49 works, it extracts id from the webpage
|
||||
# I think there might be a smarter way, but I'm just not smart enough
|
||||
video_id = self.get_temp_id(url)
|
||||
webpage = self._download_webpage(url, video_id)
|
||||
data = self._search_json(r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', video_id)
|
||||
|
||||
# print('==========')
|
||||
# print(webpage)
|
||||
# print(data)
|
||||
|
||||
# The JSON data produced the same results as the regex did, but it's much cleaner; thanks for the guidance.
|
||||
video_id = data['data']['data']['data']['extra']['videoId']
|
||||
videoLongId = str(data['data']['data']['data']['extra']['videoLongId'])
|
||||
title = data['data']['data']['data']['extra']['videoTitle']
|
||||
show_name = data['data']['data']['data']['extra']['showName']
|
||||
|
||||
video_url = 'https://play.tudou.com' + data['config']['url']
|
||||
# About video_url
|
||||
# The video url is not stored in the json file above, instead, the website uses m3u8 scheme
|
||||
# Using the F12 developer tools, I've located one request.
|
||||
# Each time I click the button to play the video, the browser GETs a .m3u8 file that contains the URLs of all clips of that video, in the resolution currently selected in the webpage player.
|
||||
# In the Debugger panel, I also found a get.json file. Visiting its source URL directly fails, but I can right-click and download it. The get.json file contains 4 m3u8_url entries, one for each of the 4 resolutions available for this video.
|
||||
# These 2 files might be what I should be looking for, guess so.
|
||||
|
||||
# I tried copying the link and sending the request via the Python request module, with headers; it failed with 403.
|
||||
# I tried copying the request as cURL and sending it via Insomnia; it failed again with 403 Forbidden.
|
||||
# Tudou.com is somewhat similar to Youku.com (already supported in yt-dlp). Tudou.com was acquired by Youku.com many years ago; they probably share some servers, and I did find similar domains on the two sites.
|
||||
# Therefore I also checked the Youku extractor, but don't know how they get to things like, line 119 'https://log.mmstat.com/eg.js'
|
||||
# I also searched the internet and found other code for Youku.com; that code has token settings, appKey, sign, etc.
|
||||
|
||||
# So I'm guessing that, for Tudou.com, a token may be needed to pass the authentication...
|
||||
# I'll keep looking into it, but if you can come up with any tips it'll be appreciated.
|
||||
video_id = self._html_search_regex(r'currentEncodeVid: \'(.+?)\',', webpage, 'xhtml')
|
||||
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
|
||||
video_url = self._html_search_regex(r'<meta property="og:url" content="(.+?)"/>', webpage, 'og:url')
|
||||
|
||||
print('==========')
|
||||
print(f'videoId = {video_id}')
|
||||
print(f'videoLongId = {videoLongId}')
|
||||
print(f'title = {title}')
|
||||
print(f'show_name = {show_name}')
|
||||
print(f'video_url = {video_url}')
|
||||
print(video_id)
|
||||
print(title)
|
||||
print(video_url)
|
||||
print('==========')
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'ext': 'mp4',
|
||||
'url': video_url,
|
||||
'show_name': show_name,
|
||||
'url': video_url
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user