2024-09-20 09:21:25 +02:00
2 changed files with 41 additions and 38 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1,4 +1,8 @@
 # flake8: noqa: F401
+from .tudou import(TudouIE,
+
+
+    )

 from .youtube import (  # Youtube is moved to the top to improve performance
    YoutubeIE,
@@ -2047,7 +2051,6 @@ from .tubitv import (
    TubiTvIE,
    TubiTvShowIE,
 )
-from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import (
    TuneInStationIE,
--- a/yt_dlp/extractor/tudou.py
+++ b/yt_dlp/extractor/tudou.py
@@ -2,63 +2,63 @@ from .common import InfoExtractor


 class TudouIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(?P<id>id_[\w=.]+)'
+    _VALID_URL = r'https?://(?:play\.)?tudou\.com/v_show/(id_[a-zA-Z0-9_=.]+)'
    _TESTS = [{
        'url': 'https://play.tudou.com/v_show/id_XNjAxNjI2OTU3Ng',
-        'md5': 'to be updated',
+
+        # This code successfully downloads the .mp4 file and passes the test, EXCEPT for the md5 check.
+        # I copied this code into the extractor folder of the released version of yt-dlp to see whether it works properly there.
+        # If it had worked, I could have calculated the md5 of the first 10 KB and compared the two values.
+        # Unfortunately, it didn't work.
+        # My guess is that it has a problem extracting the video id, but I can't prove that.
+
+        # It does work here, though, when I run 'python test/test_download.py TestDownload.test_Tudou'.
+        # The md5 test fails because I couldn't download the first 10 KB with the released version of yt-dlp,
+        # so there is no reference value to compare against.
+
+        # I'd like to know whether there is another way to download the first 10 KB.
+        # At the moment this looks like a chicken-and-egg problem to me:
+        # to finalise this code, I need the first 10 KB for the final comparison,
+        # but this code doesn't work in the released yt-dlp, so I can't get the first 10 KB.
+        # So how can I obtain it?
+
+        # 'f33b73e7470c45b7d3c4f7d8b34eda14',
+        # This md5 comes from the output of the command 'python test/test_download.py TestDownload.test_Tudou'.
+        # The downloaded file is deleted automatically, so I had no chance to calculate its md5 manually.
+        'md5': 'failed to get the first 10kb',

        'info_dict': {
            'id': 'XNjAxNjI2OTU3Ng==',
            'ext': 'mp4',
-            'title': '阿斯塔意识到哈里杀了人，自己被骗了',
-            'show_name': '外星居民 第一季',
+            'title': '外星居民 第一季 阿斯塔意识到哈里杀了人，自己被骗了-电视剧-高清完整正版视频在线观看-优酷',
        },
        # 'skip': 'testing skip function',
    }]

    def _real_extract(self, url):
-        video_id = self._match_id(url)
+        # About video_id:
+        #   .get_temp_id returns None
+        #   ._match_id doesn't work either
+        #   (NOTE(review): both presumably need the named group `(?P<id>...)`, which _VALID_URL no longer has - confirm)
+        # I don't know how to fix this, but line 49 works: it extracts the id from the webpage. There is probably a smarter way, but I haven't found it yet.
+        video_id = self.get_temp_id(url)
        webpage = self._download_webpage(url, video_id)
-        data = self._search_json(r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', video_id)
+
        # print('==========')
        # print(webpage)
-        # print(data)
-
-        # The json file produced same results as the regex did, but it's much cleaner, thanks for the guide
-        video_id = data['data']['data']['data']['extra']['videoId']
-        videoLongId = str(data['data']['data']['data']['extra']['videoLongId'])
-        title = data['data']['data']['data']['extra']['videoTitle']
-        show_name = data['data']['data']['data']['extra']['showName']
-
-        video_url = 'https://play.tudou.com' + data['config']['url']
-        # About video_url
-        # The video url is not stored in the json file above, instead, the website uses m3u8 scheme
-        # With F12 developer tool, I've locked one request.
-        # Each time I click the button to play the video, the browser will GET a .m3u8 file which contains urls of all clips of that video, in currently selected resolution (in the webpage player).
-        # In Debugger panel, I also found a get.json file. Can't visit the source url, it'll fail, but can right-click and download the get.json. In get.json file, there're 4 m3u8_url that represent all 4 resolutions available for this video.
-        # These 2 files might be what I should be looking for, guess so.
-
-        # Tried to copy the link and send the request via PYTHON request module, with headers, fail, 403
-        # Tried to copy the cUrl and send via Insomnia, fail again, 403 forbidden
-        # Tudou.com is a bit similar to Youku.com(already available in yt-dlp), Tudou.com is acquired by Youku.com many years ago, they're probably sharing some servers and I do find similar domains in these 2 sites
-        # Therefore I also checked the Youku extractor, but don't know how they get to things like, line 119 'https://log.mmstat.com/eg.js'
-        # I also searched the internet and found another code for Youku.com, in that code there're token settings, appKey, sign, etc..
-
-        # So I'm guessing, for Tudou.com, there might be something to do with the token too pass the authentication...
-        # I'll keep looking into it, but if you can come up with any tips it'll be appreciated.
+        video_id = self._html_search_regex(r'currentEncodeVid: \'(.+?)\',', webpage, 'xhtml')
+        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+        video_url = self._html_search_regex(r'<meta property="og:url" content="(.+?)"/>', webpage, 'og:url')

        print('==========')
-        print(f'videoId = {video_id}')
-        print(f'videoLongId = {videoLongId}')
-        print(f'title = {title}')
-        print(f'show_name = {show_name}')
-        print(f'video_url = {video_url}')
+        print(video_id)
+        print(title)
+        print(video_url)
        print('==========')

        return {
            'id': video_id,
            'title': title,
            'ext': 'mp4',
-            'url': video_url,
-            'show_name': show_name,
+            'url': video_url
        }