[thisvid] Add extractor

2025-03-09 09:39:57 +08:00 · 2019-06-14 21:22:27 -05:00 · 2019-06-14 21:22:27 -05:00 · 23c4fb5e3e
commit 23c4fb5e3e
parent 695720ebe8
2 changed files with 136 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1162,6 +1162,10 @@ from .theweatherchannel import TheWeatherChannelIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import (
+    ThisVidIE,
+    ThisVidEmbeddedIE
+)
 from .threeqsdn import ThreeQSDNIE
 from .tiktok import (
    TikTokIE,
--- a/youtube_dl/extractor/thisvid.py
+++ b/youtube_dl/extractor/thisvid.py
@ -0,0 +1,132 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from .openload import PhantomJSwrapper
+
+
+class ThisVidIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<id>[A-Za-z0-9-]+)'
+
+    _TEST = {
+        'url': 'https://thisvid.com/videos/madonna-show-in-sexy-underwear/',
+        'md5': '48e38730d38394c6e9f1cce66fb04c6e',
+        'info_dict': {
+            'id': '829503',
+            'display_id': 'madonna-show-in-sexy-underwear',
+            'ext': 'mp4',
+            'title': 'Madonna show in sexy underwear',
+            'thumbnail': r're:^https?://.*preview\.mp4\.jpg$',
+            'uploader_id': 'Mike_Hunt',
+            'uploader_url': 'https://thisvid.com/members/584768',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        # The webpage contains a raw piece of javascript which creates a
+        # variable called flashvars used by the player to update the webpage.
+        #
+        # Running the javascript fixes the URL in the static HTML which is
+        # broken, and updates the flashvars variable with the new info.
+        #
+        # Because there are a ton of errors from PhantomJS that do not affect
+        # the output, they have to be split from the actual JSON.
+        jscode = """
+        function checkFlashVars() {
+            flashvars = page.evaluate(function() {
+                return JSON.stringify(flashvars)
+            });
+            console.log('---'); // ensure any errors appear above where we are
+            console.log(flashvars);
+            saveAndExit();
+        }
+        checkFlashVars();"""
+        phantom = PhantomJSwrapper(self, required_version='2.0')
+        webpage, output = phantom.get(url, html=webpage, jscode=jscode)
+        flashvars = self._parse_json(output.split("---", 2)[1], display_id)
+
+        # Get the video URL from the flashvars.
+        video_url = flashvars['video_url']
+
+        # The value in the static HTML starts with "function/0/http://..."
+        # where the zero is sometimes another number.
+        #
+        # At try that static URL if there was a static update failure.
+        if video_url.startswith('function'):
+            self.report_warning('Page JS failed, fetch will likely fail')
+            video_url = video_url.split("/", 3)[2]
+
+        # Sometimes the video url ends with ".mp4",
+        # other times it ends with ".mp4/",
+        # yet other times it ends with ".mp4/?".
+        #
+        # All of it needs to be cleaned up.
+        video_url = video_url.split("?", 2)[0].strip("/")
+
+        # Get the thumbnail URL from the flashvars.
+        thumbnail_url = flashvars['preview_url']
+
+        # The thumbnail usually does not have a protocol on the front, e.g.
+        # "//media.thisvid.com"
+        if thumbnail_url.startswith("//"):
+            thumbnail_url = 'https:' + thumbnail_url
+
+        # The simplest way to get the real internal ID is to get it from the
+        # URL we will be accessing.
+        video_id = video_url.split("/")[-2]
+
+        # Parse the title information.
+        title = self._search_regex(r'<title>(?P<title>.+) -([a-zA-Z ]+ at)? ThisVid(\.com| tube)</title>',
+                                   webpage, display_id, group='title')
+
+        # Parse the author information from a profile link.
+        author_re = r'<a.*class="author" href="(?P<url>[^"]+)"[^>]*>(?P<id>[^<]+)</a'
+        uploader_id = self._search_regex(author_re, webpage, display_id,
+                                         group='id')
+        uploader_url = self._search_regex(author_re, webpage, display_id,
+                                          group='url').strip("/")
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'url': video_url,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+            'thumbnail': thumbnail_url,
+            'age_limit': 18,
+        }
+
+
+class ThisVidEmbeddedIE(ThisVidIE):
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/embed/(?P<id>[A-Za-z0-9-]+)'
+    _TEST = {
+        'url': 'https://thisvid.com/embed/854312',
+        'md5': '8166497c0281b54a48b179c997463892',
+        'info_dict': {
+            'id': '854312',
+            'display_id': 'soles-of-jaxwheeler',
+            'ext': 'mp4',
+            'title': 'Soles of JaxWheeler',
+            'thumbnail': r're:^https?://.*preview\.mp4\.jpg$',
+            'uploader_id': 'SNK13',
+            'uploader_url': 'https://thisvid.com/members/252887',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        embedded_page = self._download_webpage(url, video_id)
+        full_url = self._search_regex(
+                r'<link rel="canonical" href="(?P<href>[^"]+)"',
+                embedded_page, video_id, group='href')
+        video_id = full_url.strip("/").rsplit("/", 1)[-1]
+        return self.url_result(full_url, video_id=video_id)