• New extractor for the porn site ThisVid. Unlike other extractors, it requires JavaScript to determine the video URL, which is why it uses Selenium (Apache 2.0 license). The Generic extractor was already capable of downloading many of the videos, but only in 240p.

• Selenium requires the installation of a free webdriver (see https://pypi.python.org/pypi/selenium ), except for Safari 10+ • Added a new command-line option --browser to select the desired browser. It defaults to Safari on the Mac and Chrome elsewhere. NOTE that I have only tested Safari and Chrome so far, though Selenium is also compatible with 'firefox', 'edge', 'ie', 'opera', 'webkitgtk' and 'android' • ThisVid can detect several common errors, such as a deleted file or insufficient privileges • ThisVid does not support authentication yet
2025-02-03 01:05:44 +08:00 · 2018-03-28 12:46:55 +02:00 · 2018-03-28 12:46:55 +02:00 · 39cd85c27e
commit 39cd85c27e
parent 5634bbfe7f
4 changed files with 181 additions and 0 deletions
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@ -433,6 +433,7 @@ def _real_main(argv=None):
        # just for deprecation check
        'autonumber': opts.autonumber if opts.autonumber is True else None,
        'usetitle': opts.usetitle if opts.usetitle is True else None,
+        'web_driver': opts.web_driver,
    }

    with YoutubeDL(ydl_opts) as ydl:
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1079,6 +1079,7 @@ from .theweatherchannel import TheWeatherChannelIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import thisvidIE
 from .threeqsdn import ThreeQSDNIE
 from .tinypic import TinyPicIE
 from .tmz import (
--- a/youtube_dl/extractor/thisvid.py
+++ b/youtube_dl/extractor/thisvid.py
@ -0,0 +1,169 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from ..utils import (
+    ExtractorError,
+)
+import re
+
+from .common import InfoExtractor
+
+# Process downloads from porn site ThisVid.com
+# Requires Selenium (license Apache 2.0)
+
+class thisvidIE(InfoExtractor):
+    """Extractor for ThisVid.com video pages, driven through a Selenium browser.
+
+    The page is loaded in a real browser (selected by the ``web_driver``
+    downloader parameter), the player's play button is clicked, and the media
+    URL plus metadata are then read from the resulting DOM.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<display_id>\w+)'
+    _TEST = {
+        'url': 'https://thisvid.com/videos/final-impact-3/',
+        'md5': '8302bd736a1e4198ed80db4a0d0dd012',
+        'info_dict': {
+            'id': '490698',
+            'ext': 'mp4',
+            'title': 'Final impact_3',
+        }
+    }
+
+    # Browser names whose Selenium class name is not simply name.capitalize().
+    # NOTE(review): 'WebKitGTK' is the spelling used by Selenium 3 -- confirm
+    # against the Selenium version that is actually installed.
+    _WEBDRIVER_CLASS_NAMES = {
+        'webkitgtk': 'WebKitGTK',
+    }
+
+    # (text found in the page source, message reported to the user)
+    _PAGE_ERRORS = (
+        # The site says 'yet' but removed files can also cause that error
+        ("SORRY, THE FILE DOES NOT EXIST YET", "This file does not exist on Thisvid.com"),
+        ("Sorry, this file was deleted", "This file has been deleted from Thisvid.com"),
+        ("This video is a private video", "This video is private"),
+    )
+
+    # Trailing text of the og:title value that is not part of the video title
+    _TITLE_SUFFIX = ' - ThisVid.com'
+
+    def try_find_element_attribute(self, driver, xpath, attr):
+        """Return one attribute of the first element matching *xpath*.
+
+        If *attr* is "_text_" the element text is returned instead of an
+        attribute. Returns None when the lookup fails for any reason.
+        """
+        try:
+            element = driver.find_element_by_xpath(xpath)
+            if attr == "_text_":
+                return element.text
+            return element.get_attribute(attr)
+        except Exception:
+            # Best-effort lookup. Unlike a bare ``except:`` this does not
+            # swallow KeyboardInterrupt / SystemExit.
+            return None
+
+    def try_find_elements_attribute(self, driver, xpath, attr):
+        """Same as try_find_element_attribute but for every matching element.
+
+        Returns a list (possibly empty), or None when the lookup fails.
+        """
+        try:
+            elements = driver.find_elements_by_xpath(xpath)
+            if attr == "_text_":
+                return [element.text for element in elements]
+            return [element.get_attribute(attr) for element in elements]
+        except Exception:
+            return None
+
+    @staticmethod
+    def _site_error(message):
+        """Build the user-facing (expected) error raised for page problems."""
+        return ExtractorError('ThisVid said: %s' % message, expected=True)
+
+    @staticmethod
+    def _to_int(text):
+        """Return int(text), or None when *text* is missing or not a number."""
+        try:
+            return int(text)
+        except (TypeError, ValueError):
+            return None
+
+    @staticmethod
+    def _parse_duration(text):
+        """Convert the first '[H:]MM:SS' clock string in *text* to seconds.
+
+        Returns None when *text* contains no such string.
+        """
+        clock = re.search(r'[0-9]+(?::[0-9]+)+', text or '')
+        if clock is None:
+            return None
+        seconds = 0
+        for part in clock.group(0).split(':'):
+            seconds = seconds * 60 + int(part)
+        return seconds
+
+    def _extract_optional_metadata(self, driver):
+        """Collect the non-essential fields; anything not found is omitted.
+
+        Every field is looked up independently so that one missing element
+        does not discard the fields that were found.
+        """
+        meta = {}
+
+        def og_content(prop):
+            return self.try_find_element_attribute(
+                driver, "//meta[@property='og:%s'][1]" % prop, "content")
+
+        values = self.try_find_elements_attribute(
+            driver, "//ul[@class='tools-left']/li/span[@class='title-description']", "_text_") or []
+        if len(values) > 0:
+            meta['view_count'] = self._to_int(values[0])
+        if len(values) > 1:
+            # NOTE(review): stored exactly as displayed on the page; confirm
+            # that this matches the date format the downloader expects.
+            meta['release_date'] = values[1]
+        if len(values) > 2:
+            meta['duration'] = self._parse_duration(values[2])
+
+        meta['description'] = og_content('description')
+        meta['tags'] = self.try_find_elements_attribute(
+            driver, "//meta[@property='og:video:tag']", "content")
+        meta['width'] = self._to_int(og_content('video:width'))
+        meta['height'] = self._to_int(og_content('video:height'))
+        meta['thumbnail'] = og_content('image')
+
+        try:
+            links = driver.find_elements_by_xpath("//ul[@class='description']/li/a")
+            if links:
+                meta['categories'] = [links[0].text]  # There is only one category per movie
+                meta['uploader_id'] = links[-1].text
+                meta['uploader_url'] = links[-1].get_attribute("href")
+        except Exception:
+            self.to_screen("Exception while getting extra info")
+
+        return dict((key, value) for key, value in meta.items() if value is not None)
+
+    def _real_extract(self, url):
+        """Load *url* in the selected browser and return the info dictionary.
+
+        Raises ExtractorError when no usable browser is available, when the
+        site reports the video as missing/deleted/private, when the player
+        does not appear, or when the id, URL or title cannot be found.
+        The browser is always closed before returning or raising.
+        """
+        params = self._downloader.params
+        be_verbose = params.get('verbose', False)
+
+        wanted_driver = params.get('web_driver')
+        if not wanted_driver:
+            # The parameter is absent when the option was never set, e.g.
+            # when the downloader is embedded instead of run from the CLI.
+            raise ExtractorError(
+                'No browser selected; set the "web_driver" parameter (--browser)',
+                expected=True)
+        class_name = self._WEBDRIVER_CLASS_NAMES.get(
+            wanted_driver, wanted_driver.capitalize())
+        driver_class = getattr(webdriver, class_name, None)
+        if driver_class is None:
+            raise ExtractorError(
+                "The installed Selenium does not provide a webdriver for '%s'" % wanted_driver,
+                expected=True)
+
+        if be_verbose:
+            self.to_screen('Extractor started. Connecting to the selected browser (%s)' % wanted_driver)
+
+        info = {
+            'age_limit': 18,
+            'ext': 'mp4',
+            'extractor': 'ThisVid',
+        }
+
+        driver = None
+        try:
+            try:
+                driver = driver_class()
+                driver.get(url)
+                source = driver.page_source
+            except Exception:
+                raise self._site_error(
+                    "Could not connect to the webdriver. Make sure you have installed the webdriver for '%s'" % wanted_driver)
+
+            # Check common errors
+            for marker, message in self._PAGE_ERRORS:
+                if marker in source:
+                    raise self._site_error(message)
+
+            # Click the Play button. The lookup raises (it never returns
+            # None) when the button is missing, so both steps are guarded.
+            try:
+                driver.find_element_by_class_name('fp-play').click()
+            except Exception:
+                raise self._site_error("Page does not contain expected data")
+
+            try:  # Until the true URL appears in the DOM
+                WebDriverWait(driver, 30).until(
+                    EC.presence_of_element_located((By.CLASS_NAME, "fp-engine")))
+            except Exception:
+                raise self._site_error("Browser timed out")
+
+            og_url = self.try_find_element_attribute(
+                driver, "//meta[@property='og:video:url'][1]", "content")
+            id_match = re.search(r'/([0-9]+)/', og_url or '')
+            if id_match is None:
+                raise ExtractorError('Unable to extract video id')
+            info['id'] = id_match.group(1)
+            if be_verbose:
+                self.to_screen("Found video id %s" % info['id'])
+
+            video_url = self.try_find_element_attribute(
+                driver, "//video[@class='fp-engine'][1]", "src")
+            if not video_url:
+                raise ExtractorError('Unable to extract video URL')
+            info['url'] = video_url
+
+            video_title = self.try_find_element_attribute(
+                driver, "//meta[@property='og:title'][1]", "content")
+            if not video_title:
+                raise ExtractorError('Unable to extract title')
+            # Strip the trailing site name only when it is really there
+            if video_title.endswith(self._TITLE_SUFFIX):
+                video_title = video_title[:-len(self._TITLE_SUFFIX)]
+            info['title'] = video_title
+
+            # Gather other data
+            info.update(self._extract_optional_metadata(driver))
+        finally:
+            if driver is not None:
+                try:
+                    driver.quit()
+                except Exception:
+                    # A failure while closing the browser must not hide the
+                    # extraction result or the original error.
+                    pass
+
+        if be_verbose:
+            self.to_screen("Will return the following data")
+            self.to_screen('%s' % (info,))
+
+        return info
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@ -4,6 +4,7 @@ import os.path
 import optparse
 import re
 import sys
+import platform

 from .downloader.external import list_external_downloaders
 from .compat import (
@ -445,7 +446,16 @@ def parseOpts(overrideArguments=None):
        default=[], callback=_comma_separated_values_options_callback,
        help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags')

+    default_driver = 'chrome' #Default webdriver is Chrome, except on the Mac
+    if platform.system()=="Darwin":
+        default_driver = 'safari'
+
    downloader = optparse.OptionGroup(parser, 'Download Options')
+    downloader.add_option(
+        '--browser',
+        action='store', dest='web_driver', choices=['safari', 'chrome', 'firefox', 'edge', 'ie',  'opera', 'webkitgtk', 'android'],
+        default=default_driver,
+        help='Browser to use for websites requiring interaction. See Selenium for more information.')
    downloader.add_option(
        '-r', '--limit-rate', '--rate-limit',
        dest='ratelimit', metavar='RATE',