2024-09-20 17:31:25 +02:00
2 changed files with 41 additions and 38 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -1,4 +1,8 @@
 # flake8: noqa: F401
 from .tudou import(TudouIE,
    )
 from .youtube import (  # Youtube is moved to the top to improve performance
    YoutubeIE,
@ -2047,7 +2051,6 @@ from .tubitv import (
    TubiTvIE,
    TubiTvShowIE,
 )
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import (
    TuneInStationIE,
--- a/yt_dlp/extractor/tudou.py
+++ b/yt_dlp/extractor/tudou.py
@ -2,63 +2,63 @@ from .common import InfoExtractor
 class TudouIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(?P<id>id_[\w=.]+)'
+    _VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(id_[a-zA-Z0-9_=.]+)'
    _TESTS = [{
        'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng',
-        'md5': 'to be updated',
+
        # this code successfully downloaded the .mp4 file, and passed the test, EXCEPT the md5 part.
        # I moved this code to the extractor folder of the released version of yt-dlp, trying to see if it works properly in there.
        # IF it worked, then I can calculate the md5 of the first 10kb, and compare the md5.
        # Unfortunately, it didn't.
        # I'm guessing it has a problem extracting the video id, but I can't prove my guess.
        # But it does work here, when run 'python test/test_download.py TestDownload.test_Tudou'
        # That's why it didn't pass the md5 test, because I couldn't download the first 10kb with the released version of yt-dlp.
        # So there's nothing to compare
        # I'm interested to know if there's another way to download the first 10kb.
        # Currently it's a paradox to me:
        # To finalise this code, I need to get the first 10kb, to do the last comparison.
        # But this code doesn't work in the released yt-dlp, so I can't get the first 10kb.
        # So how can I.....
        # 'f33b73e7470c45b7d3c4f7d8b34eda14',
        # this md5, is from the output of this command - 'python test/test_download.py TestDownload.test_Tudou'.
        # the downloaded file is deleted automatically, not giving me a chance to calculate its md5 manually.
        'md5': 'failed to get the first 10kb',
        'info_dict': {
            'id': 'XNjAxNjI2OTU3Ng==',
            'ext': 'mp4',
-            'title': '阿斯塔意识到哈里杀了人，自己被骗了',
+            'title': '外星居民 第一季 阿斯塔意识到哈里杀了人，自己被骗了-电视剧-高清完整正版视频在线观看-优酷',
            'show_name': '外星居民 第一季',
        },
        # 'skip': 'testing skip function',
    }]
    def _real_extract(self, url):
-        video_id = self._match_id(url)
+        # About video_id
        #   .get_temp_id returns None
        #   ._match_id doesn't work either
        # I don't know how to fix this, but line 49 works, it extracts id from the webpage
        # I think there might be a smarter way, but I'm just not smart enough
        video_id = self.get_temp_id(url)
        webpage = self._download_webpage(url, video_id)
-        data = self._search_json(r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', video_id)
+
        # print('==========')
        # print(webpage)
-        # print(data)
+        video_id = self._html_search_regex(r'currentEncodeVid: \'(.+?)\',', webpage, 'xhtml')
-
+        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
-        # The json file produced same results as the regex did, but it's much cleaner, thanks for the guide
+        video_url = self._html_search_regex(r'<meta property="og:url" content="(.+?)"/>', webpage, 'og:url')
        video_id = data['data']['data']['data']['extra']['videoId']
        videoLongId = str(data['data']['data']['data']['extra']['videoLongId'])
        title = data['data']['data']['data']['extra']['videoTitle']
        show_name = data['data']['data']['data']['extra']['showName']
        video_url = 'https://play.tudou.com' + data['config']['url']
        # About video_url
        # The video url is not stored in the json file above, instead, the website uses m3u8 scheme
        # With F12 developer tool, I've locked one request.
        # Each time I click the button to play the video, the browser will GET a .m3u8 file which contains urls of all clips of that video, in currently selected resolution (in the webpage player).
        # In the Debugger panel, I also found a get.json file. Visiting its source url directly fails, but I can right-click and download get.json. In the get.json file, there are 4 m3u8_url entries, one for each of the 4 resolutions available for this video.
        # These 2 files might be what I should be looking for, guess so.
        # Tried to copy the link and send the request via the Python request module, with headers: it failed with 403
        # Tried to copy the cURL command and send it via Insomnia: it failed again, 403 forbidden
        # Tudou.com is a bit similar to Youku.com (already available in yt-dlp). Tudou.com was acquired by Youku.com many years ago, so they're probably sharing some servers, and I do find similar domains in these 2 sites
        # Therefore I also checked the Youku extractor, but don't know how they get to things like, line 119 'https://log.mmstat.com/eg.js'
        # I also searched the internet and found another code for Youku.com, in that code there're token settings, appKey, sign, etc..
        # So I'm guessing, for Tudou.com, there might be something to do with the token to pass the authentication...
        # I'll keep looking into it, but if you can come up with any tips it'll be appreciated.
        print('==========')
-        print(f'videoId = {video_id}')
+        print(video_id)
-        print(f'videoLongId = {videoLongId}')
+        print(title)
-        print(f'title = {title}')
+        print(video_url)
        print(f'show_name = {show_name}')
        print(f'video_url = {video_url}')
        print('==========')
        return {
            'id': video_id,
            'title': title,
            'ext': 'mp4',
-            'url': video_url,
+            'url': video_url
            'show_name': show_name,
        }