From 39cd85c27e3a392c8b186f92be046856d0ddd1a4 Mon Sep 17 00:00:00 2001
From: Jessie C <jessie.clarke32@gmail.com>
Date: Wed, 28 Mar 2018 12:46:55 +0200
Subject: [PATCH] =?UTF-8?q?=E2=80=A2=20New=20extractor=20for=20porn=20site?=
 =?UTF-8?q?=20ThisVid.=20By=20contrast=20with=20other=20extractors,=20it?=
 =?UTF-8?q?=20requires=20JavaScript=20in=20order=20to=20determine=20the=20?=
 =?UTF-8?q?video=20URL.=20That=20is=20why=20it=20uses=20Selenium=20(licens?=
 =?UTF-8?q?e=20Apache=202).=20The=20Generic=20extractor=20was=20already=20?=
 =?UTF-8?q?capable=20to=20download=20many=20of=20the=20videos=20but=20only?=
 =?UTF-8?q?=20in=20240p.=20=E2=80=A2=20Selenium=20requires=20the=20install?=
 =?UTF-8?q?ation=20of=20a=20free=20webdriver=20(see=20https://pypi.python.?=
 =?UTF-8?q?org/pypi/selenium=20),=20except=20for=20Safari=2010+=20?=
 =?UTF-8?q?=E2=80=A2=20Added=20new=20option=20--browser=20in=20the=20comma?=
 =?UTF-8?q?nd=20line=20to=20indicate=20the=20desired=20browser.=20It=20def?=
 =?UTF-8?q?aults=20to=20Safari=20on=20the=20Mac=20and=20Chrome=20elsewhere?=
 =?UTF-8?q?.=20NOTE=20that=20I=20have=20only=20tested=20Safari=20and=20Chr?=
 =?UTF-8?q?ome=20until=20now,=20though=20Selenium=20is=20compatible=20with?=
 =?UTF-8?q?=20'firefox',=20'edge',=20'ie',=20=20'opera',=20'webkitgtk',=20?=
 =?UTF-8?q?'android'=20=E2=80=A2=20ThisVid=20can=20detect=20several=20comm?=
 =?UTF-8?q?on=20errors=20like=20a=20deleted=20file=20or=20insufficient=20p?=
 =?UTF-8?q?rivileges=20=E2=80=A2=20ThisVid=20does=20not=20support=20authen?=
 =?UTF-8?q?tication=20yet?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 youtube_dl/__init__.py             |   1 +
 youtube_dl/extractor/extractors.py |   1 +
 youtube_dl/extractor/thisvid.py    | 169 +++++++++++++++++++++++++++++
 youtube_dl/options.py              |  10 ++
 4 files changed, 181 insertions(+)
 create mode 100755 youtube_dl/extractor/thisvid.py

diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 9bb952457..10c883c80 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -433,6 +433,7 @@ def _real_main(argv=None):
         # just for deprecation check
         'autonumber': opts.autonumber if opts.autonumber is True else None,
         'usetitle': opts.usetitle if opts.usetitle is True else None,
+        'web_driver': opts.web_driver,
     }
 
     with YoutubeDL(ydl_opts) as ydl:
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index de48a37ad..894d90718 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1079,6 +1079,7 @@ from .theweatherchannel import TheWeatherChannelIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import thisvidIE
 from .threeqsdn import ThreeQSDNIE
 from .tinypic import TinyPicIE
 from .tmz import (
diff --git a/youtube_dl/extractor/thisvid.py b/youtube_dl/extractor/thisvid.py
new file mode 100755
index 000000000..4e229a978
--- /dev/null
+++ b/youtube_dl/extractor/thisvid.py
@@ -0,0 +1,169 @@
+# coding: utf-8
+from __future__ import unicode_literals
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from ..utils import (
+    ExtractorError,
+)
+import re
+
+from .common import InfoExtractor
+
+# Process downloads from porn site ThisVid.com
+# Requires Selenium (license Apache 2.0)
+
+class thisvidIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<display_id>\w+)'
+    _TEST = {
+        'url': 'https://thisvid.com/videos/final-impact-3/',
+        'md5': '8302bd736a1e4198ed80db4a0d0dd012',
+        'info_dict': {
+            'id': '490698',
+            'ext': 'mp4',
+            'title': 'Final impact_3',
+        }
+    }
+
+    def try_find_element_attribute(self, driver, xpath, attr):
+        """Return attr ("_text_" = element text) of the first xpath match, or None."""
+        try:
+            element = driver.find_element_by_xpath(xpath)
+            if attr == "_text_":
+                return element.text
+            return element.get_attribute(attr)
+        except Exception:  # best effort, but let Ctrl-C / SystemExit through
+            return None
+
+    def try_find_elements_attribute(self, driver, xpath, attr):
+        """Return one value per element matching xpath, or None on failure.
+
+        Same as try_find_element_attribute but for a list.
+        driver -- Selenium webdriver holding the loaded page
+        xpath  -- XPath expression selecting the elements
+        attr   -- attribute to read from each element; the special value
+                  "_text_" reads the element's text instead
+        """
+        try:
+            elements = driver.find_elements_by_xpath(xpath)
+            if attr == "_text_":
+                return [element.text for element in elements]
+            return [element.get_attribute(attr) for element in elements]
+        except Exception:  # best effort, but let Ctrl-C / SystemExit through
+            return None
+
+    def _real_extract(self, url):
+        """Extract the video URL and metadata of a ThisVid page via Selenium.
+
+        The page is opened in a real browser because the final video URL
+        only shows up in the DOM after the player has been started.
+
+        Returns the info dict: 'id', 'url', 'title', 'ext', 'age_limit' and
+        any optional metadata that could be read from the page.
+        Raises ExtractorError when no browser can be driven, the site reports
+        the video as missing/deleted/private, the player does not appear
+        within 30 seconds, or a mandatory field is absent.
+        """
+        info = {
+            'age_limit': 18,
+            'ext': 'mp4',
+            'extractor': 'ThisVid',
+        }
+        be_verbose = self._downloader.params.get('verbose', False)
+
+        # str.capitalize() yields the Selenium class name for most browsers
+        # ('chrome' -> 'Chrome') but not for WebKitGTK; an unset option or an
+        # unknown name must produce a clean error, not an AttributeError.
+        wanted_driver = self._downloader.params.get('web_driver') or ''
+        class_name = {'webkitgtk': 'WebKitGTK'}.get(wanted_driver, wanted_driver.capitalize())
+        w_driver = getattr(webdriver, class_name, None) if class_name else None
+        if w_driver is None:
+            raise ExtractorError("No Selenium webdriver is available for browser '%s'" % wanted_driver, expected=True)
+        if be_verbose:
+            self.to_screen('Extractor started. Connecting to the selected browser (%s)' % wanted_driver)
+
+        driver = None
+        try:  # the finally clause closes the browser on every exit path
+            try:
+                driver = w_driver()
+                driver.get(url)
+                source = driver.page_source
+            except Exception:
+                raise ExtractorError(
+                    "ThisVid said: Could not connect to the webdriver. Make sure you have installed the webdriver for '%s'" % wanted_driver,
+                    expected=True)
+
+            # Check common errors
+            error_msg = None
+            if "SORRY, THE FILE DOES NOT EXIST YET" in source:
+                # Error says 'yet' but removed files can also cause that error
+                error_msg = "This file does not exist on Thisvid.com"
+            elif "Sorry, this file was deleted" in source:
+                error_msg = "This file has been deleted from Thisvid.com"
+            elif "This video is a private video" in source:
+                error_msg = "This video is private"
+            if error_msg:
+                raise ExtractorError('ThisVid said: %s' % error_msg, expected=True)
+
+            # Click the Play button. Selenium raises (it never returns None)
+            # when the element is missing, so the failure has to be caught.
+            try:
+                driver.find_element_by_class_name('fp-play').click()
+            except Exception:
+                raise ExtractorError('ThisVid said: Page does not contain expected data', expected=True)
+
+            try:  # Until the true URL appears in the DOM
+                WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, "fp-engine")))
+            except Exception:
+                raise ExtractorError('ThisVid said: Browser timed out', expected=True)
+
+            og_url = self.try_find_element_attribute(driver, "//meta[@property='og:video:url'][1]", "content")
+            id_match = re.search(r'/([0-9]+)/', og_url or '')
+            video_url = self.try_find_element_attribute(driver, "//video[@class='fp-engine'][1]", "src")
+            video_title = self.try_find_element_attribute(driver, "//meta[@property='og:title'][1]", "content")
+            # The helper returns None for anything it cannot find; without
+            # these three fields there is nothing usable to return.
+            if not (id_match and video_url and video_title):
+                raise ExtractorError('ThisVid said: Page does not contain expected data', expected=True)
+
+            info['id'] = id_match.group(1)
+            if be_verbose:
+                self.to_screen("Found video id %s" % info['id'])
+            info['url'] = video_url
+
+            # Strip the trailing " - ThisVid.com" only when it is really there
+            site_suffix = ' - ThisVid.com'
+            if video_title.endswith(site_suffix):
+                video_title = video_title[:-len(site_suffix)]
+            info['title'] = video_title
+
+            # Gather other data; all of it is optional
+            try:
+                elem_names = self.try_find_elements_attribute(driver, "//ul[@class='tools-left']/li/span[@class='title']", "_text_")
+                elem_values = self.try_find_elements_attribute(driver, "//ul[@class='tools-left']/li/span[@class='title-description']", "_text_")
+                info['average_rating'] = float(re.findall(r'Rating:\s([0-9]\.[0-9])', elem_names[0])[0])
+                info['view_count'] = int(elem_values[0])
+                info['release_date'] = elem_values[1]
+                minutes, seconds = re.findall(r'([0-9]+):([0-9]+)', elem_values[2])[0]
+                info['duration'] = int(minutes) * 60 + int(seconds)
+                info['description'] = self.try_find_element_attribute(driver, "//meta[@property='og:description'][1]", "content")
+                info['tags'] = self.try_find_elements_attribute(driver, "//meta[@property='og:video:tag']", "content")
+                info['width'] = int(self.try_find_element_attribute(driver, "//meta[@property='og:video:width'][1]", "content"))
+                info['height'] = int(self.try_find_element_attribute(driver, "//meta[@property='og:video:height'][1]", "content"))
+                info['thumbnail'] = self.try_find_element_attribute(driver, "//meta[@property='og:image'][1]", "content")
+                desc_block = driver.find_elements_by_xpath("//ul[@class='description']/li/a")
+                info['categories'] = [desc_block[0].text]  # There is only one category per movie
+                info['uploader_id'] = desc_block[-1].text
+                info['uploader_url'] = desc_block[-1].get_attribute("href")
+            except Exception:
+                self.to_screen("Exception while getting extra info")
+        finally:
+            if driver is not None:
+                driver.quit()
+
+        if be_verbose:
+            self.to_screen("Will return the following data")
+            self.to_screen(info)
+
+        return info
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 7d1bbc021..0660c26bd 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -4,6 +4,7 @@ import os.path
 import optparse
 import re
 import sys
+import platform
 
 from .downloader.external import list_external_downloaders
 from .compat import (
@@ -445,7 +446,16 @@ def parseOpts(overrideArguments=None):
         default=[], callback=_comma_separated_values_options_callback,
         help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags')
 
+    default_driver = 'chrome' #Default webdriver is Chrome, except on the Mac
+    if platform.system()=="Darwin":
+        default_driver = 'safari'
+
     downloader = optparse.OptionGroup(parser, 'Download Options')
+    downloader.add_option(
+        '--browser',
+        action='store', dest='web_driver', choices=['safari', 'chrome', 'firefox', 'edge', 'ie',  'opera', 'webkitgtk', 'android'],
+        default=default_driver,
+        help='Browser to use for websites requiring interaction. See Selenium for more information.')
     downloader.add_option(
         '-r', '--limit-rate', '--rate-limit',
         dest='ratelimit', metavar='RATE',