Compare commits

..

No commits in common. "360ecdd6145f304d38ba0735f52b3622d3634808" and "197bcd877c012ee2d5bfc9ae1d63eed08362c62f" have entirely different histories.

2 changed files with 41 additions and 38 deletions

View File

@ -1,4 +1,8 @@
# flake8: noqa: F401 # flake8: noqa: F401
from .tudou import(TudouIE,
)
from .youtube import ( # Youtube is moved to the top to improve performance from .youtube import ( # Youtube is moved to the top to improve performance
YoutubeIE, YoutubeIE,
@ -2047,7 +2051,6 @@ from .tubitv import (
TubiTvIE, TubiTvIE,
TubiTvShowIE, TubiTvShowIE,
) )
from .tudou import TudouIE
from .tumblr import TumblrIE from .tumblr import TumblrIE
from .tunein import ( from .tunein import (
TuneInStationIE, TuneInStationIE,

View File

@ -2,63 +2,63 @@ from .common import InfoExtractor
class TudouIE(InfoExtractor): class TudouIE(InfoExtractor):
_VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(?P<id>id_[\w=.]+)' _VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(id_[a-zA-Z0-9_=.]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng', 'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng',
'md5': 'to be updated',
# this code successfully downloaded the .mp4 file, and passed the test, EXCEPT the md5 part.
# I moved this code to the extractor folder of the released version of yt-dlp, trying to see if it works properly in there.
# IF it worked, then I can calculate the md5 of the first 10kb, and compare the md5.
# Unfortunately, it didn't.
# I'm guessing it has a problem extracting the video id, but I can't prove my guess.
# But it does work here, when running 'python test/test_download.py TestDownload.test_Tudou'
# That's why it didn't pass the md5 test, because I couldn't download the first 10kb with the released version of yt-dlp.
# So there's nothing to compare
# I'm interested to know if there's another way to download the first 10kb.
# Currently it's a paradox to me:
# To finalise this code, I need to get the first 10kb, to do the last comparison.
# But this code doesn't work in the released yt-dlp, so I can't get the first 10kb.
# So how can I.....
# 'f33b73e7470c45b7d3c4f7d8b34eda14',
# this md5, is from the output of this command - 'python test/test_download.py TestDownload.test_Tudou'.
# the downloaded file is deleted automatically, not giving me a chance to calculate its md5 manually.
'md5': 'failed to get the first 10kb',
'info_dict': { 'info_dict': {
'id': 'XNjAxNjI2OTU3Ng==', 'id': 'XNjAxNjI2OTU3Ng==',
'ext': 'mp4', 'ext': 'mp4',
'title': '阿斯塔意识到哈里杀了人,自己被骗了', 'title': '外星居民 第一季 阿斯塔意识到哈里杀了人,自己被骗了-电视剧-高清完整正版视频在线观看-优酷',
'show_name': '外星居民 第一季',
}, },
# 'skip': 'testing skip function', # 'skip': 'testing skip function',
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) # About video_id
# .get_temp_id returns None
# ._match_id doesn't work either
# I don't know how to fix this, but line 49 works, it extracts id from the webpage
# I think there might be a smarter way, but I'm just not smart enough
video_id = self.get_temp_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
data = self._search_json(r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', video_id)
# print('==========') # print('==========')
# print(webpage) # print(webpage)
# print(data) video_id = self._html_search_regex(r'currentEncodeVid: \'(.+?)\',', webpage, 'xhtml')
title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
# The json file produced same results as the regex did, but it's much cleaner, thanks for the guide video_url = self._html_search_regex(r'<meta property="og:url" content="(.+?)"/>', webpage, 'og:url')
video_id = data['data']['data']['data']['extra']['videoId']
videoLongId = str(data['data']['data']['data']['extra']['videoLongId'])
title = data['data']['data']['data']['extra']['videoTitle']
show_name = data['data']['data']['data']['extra']['showName']
video_url = 'https://play.tudou.com' + data['config']['url']
# About video_url
# The video url is not stored in the json file above, instead, the website uses m3u8 scheme
# With F12 developer tool, I've locked one request.
# Each time I click the button to play the video, the browser will GET a .m3u8 file which contains urls of all clips of that video, in currently selected resolution (in the webpage player).
# In Debugger panel, I also found a get.json file. Can't visit the source url, it'll fail, but can right-click and download the get.json. In get.json file, there're 4 m3u8_url that represent all 4 resolutions available for this video.
# These 2 files might be what I should be looking for, guess so.
# Tried to copy the link and send the request via the Python requests module, with headers; it failed with 403
# Tried to copy the cURL command and send it via Insomnia; it failed again with 403 Forbidden
# Tudou.com is a bit similar to Youku.com (already available in yt-dlp). Tudou.com was acquired by Youku.com many years ago; they're probably sharing some servers, and I do find similar domains in these 2 sites
# Therefore I also checked the Youku extractor, but don't know how they get to things like, line 119 'https://log.mmstat.com/eg.js'
# I also searched the internet and found another code for Youku.com, in that code there're token settings, appKey, sign, etc..
# So I'm guessing that, for Tudou.com, there might be something to do with the token to pass the authentication...
# I'll keep looking into it, but if you can come up with any tips it'll be appreciated.
print('==========') print('==========')
print(f'videoId = {video_id}') print(video_id)
print(f'videoLongId = {videoLongId}') print(title)
print(f'title = {title}') print(video_url)
print(f'show_name = {show_name}')
print(f'video_url = {video_url}')
print('==========') print('==========')
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'ext': 'mp4', 'ext': 'mp4',
'url': video_url, 'url': video_url
'show_name': show_name,
} }