From 39cd85c27e3a392c8b186f92be046856d0ddd1a4 Mon Sep 17 00:00:00 2001
From: Jessie C
Date: Wed, 28 Mar 2018 12:46:55 +0200
Subject: [PATCH] =?UTF-8?q?=E2=80=A2=20New=20extractor=20for=20porn=20site?=
 =?UTF-8?q?=20ThisVid.=20By=20contrast=20with=20other=20extractors,=20it?=
 =?UTF-8?q?=20requires=20JavaScript=20in=20order=20to=20determine=20the=20?=
 =?UTF-8?q?video=20URL.=20That=20is=20why=20it=20uses=20Selenium=20(licens?=
 =?UTF-8?q?e=20Apache=202).=20The=20Generic=20extractor=20was=20already=20?=
 =?UTF-8?q?capable=20to=20download=20many=20of=20the=20videos=20but=20only?=
 =?UTF-8?q?=20in=20240p.=20=E2=80=A2=20Selenium=20requires=20the=20install?=
 =?UTF-8?q?ation=20of=20a=20free=20webdriver=20(see=20https://pypi.python.?=
 =?UTF-8?q?org/pypi/selenium=20),=20except=20for=20Safari=2010+=20?=
 =?UTF-8?q?=E2=80=A2=20Added=20new=20option=20--browser=20in=20the=20comma?=
 =?UTF-8?q?nd=20line=20to=20indicate=20the=20desired=20browser.=20It=20def?=
 =?UTF-8?q?aults=20to=20Safari=20on=20the=20Mac=20and=20Chrome=20elsewhere?=
 =?UTF-8?q?.=20NOTE=20that=20I=20have=20only=20tested=20Safari=20and=20Chr?=
 =?UTF-8?q?ome=20until=20now,=20though=20Selenium=20is=20compatible=20with?=
 =?UTF-8?q?=20'firefox',=20'edge',=20'ie',=20=20'opera',=20'webkitgtk',=20?=
 =?UTF-8?q?'android'=20=E2=80=A2=20ThisVid=20can=20detect=20several=20comm?=
 =?UTF-8?q?on=20errors=20like=20a=20deleted=20file=20or=20insufficient=20p?=
 =?UTF-8?q?rivileges=20=E2=80=A2=20ThisVid=20does=20not=20support=20authen?=
 =?UTF-8?q?tication=20yet?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 youtube_dl/__init__.py             |   1 +
 youtube_dl/extractor/extractors.py |   1 +
 youtube_dl/extractor/thisvid.py    | 263 +++++++++++++++++++++++++++++
 youtube_dl/options.py              |  10 ++
 4 files changed, 275 insertions(+)
 create mode 100755 youtube_dl/extractor/thisvid.py

diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 9bb952457..10c883c80 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -433,6 +433,7 @@ def _real_main(argv=None):
         # just for deprecation check
         'autonumber': opts.autonumber if opts.autonumber is True else None,
         'usetitle': opts.usetitle if opts.usetitle is True else None,
+        'web_driver': opts.web_driver,
     }
 
     with YoutubeDL(ydl_opts) as ydl:
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index de48a37ad..894d90718 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1079,6 +1079,7 @@ from .theweatherchannel import TheWeatherChannelIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import thisvidIE
 from .threeqsdn import ThreeQSDNIE
 from .tinypic import TinyPicIE
 from .tmz import (
diff --git a/youtube_dl/extractor/thisvid.py b/youtube_dl/extractor/thisvid.py
new file mode 100755
index 000000000..4e229a978
--- /dev/null
+++ b/youtube_dl/extractor/thisvid.py
@@ -0,0 +1,263 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+try:
+    from selenium import webdriver
+    from selenium.common.exceptions import WebDriverException
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.support import expected_conditions as EC
+    from selenium.webdriver.support.ui import WebDriverWait
+except ImportError:
+    # Selenium (license Apache 2.0) is optional. extractors.py imports this
+    # module unconditionally, so a hard import here would break youtube-dl
+    # wherever Selenium is not installed.
+    webdriver = By = EC = WebDriverWait = None
+    WebDriverException = Exception
+
+
+def _int_or_none(value):
+    """Return value converted to int, or None if it is missing or malformed."""
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return None
+
+
+def _duration_in_seconds(text):
+    """Convert the first 'mm:ss' or 'hh:mm:ss' run found in text to seconds.
+
+    Returns None when text is empty or contains no such run.
+    """
+    match = re.search(r'[0-9]+(?::[0-9]+)+', text or '')
+    if match is None:
+        return None
+    seconds = 0
+    for part in match.group(0).split(':'):
+        seconds = seconds * 60 + int(part)
+    return seconds
+
+
+class thisvidIE(InfoExtractor):
+    """Extract videos from the porn site ThisVid.com through a real browser.
+
+    The site requires JavaScript to determine the video URL, so the page is
+    opened and clicked through Selenium instead of being downloaded directly.
+    Authentication is not supported.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<display_id>[^/?#]+)'
+    _TEST = {
+        'url': 'https://thisvid.com/videos/final-impact-3/',
+        'md5': '8302bd736a1e4198ed80db4a0d0dd012',
+        'info_dict': {
+            'id': '490698',
+            'ext': 'mp4',
+            'title': 'Final impact_3',
+        }
+    }
+
+    # Suffix the site appends to the og:title of a video page
+    _TITLE_SUFFIX = ' - ThisVid.com'
+    # Seconds to wait for the player to add the <video> element to the DOM
+    _PLAYER_TIMEOUT = 30
+    # Value of --browser -> name of the driver class in selenium.webdriver
+    _WEBDRIVER_CLASSES = {
+        'safari': 'Safari',
+        'chrome': 'Chrome',
+        'firefox': 'Firefox',
+        'edge': 'Edge',
+        'ie': 'Ie',
+        'opera': 'Opera',
+        'webkitgtk': 'WebKitGTK',
+        'android': 'Android',
+    }
+    # Text shown on the site's error pages -> message reported to the user
+    _PAGE_ERRORS = (
+        # The site says 'yet' but removed files can also cause that error
+        ('SORRY, THE FILE DOES NOT EXIST YET', 'This file does not exist on Thisvid.com'),
+        ('Sorry, this file was deleted', 'This file has been deleted from Thisvid.com'),
+        ('This video is a private video', 'This video is private'),
+    )
+
+    def try_find_element_attribute(self, driver, xpath, attr):
+        """Return attribute attr of the first element matching xpath.
+
+        Pass attr='_text_' to get the element text instead of an attribute.
+        Returns None when nothing matches or the browser reports an error.
+        """
+        try:
+            element = driver.find_element(By.XPATH, xpath)
+            if attr == '_text_':
+                return element.text
+            return element.get_attribute(attr)
+        except WebDriverException:
+            return None
+
+    def try_find_elements_attribute(self, driver, xpath, attr):
+        """Return attribute attr of every element matching xpath, as a list.
+
+        Pass attr='_text_' to get the element texts instead of an attribute.
+        Returns None when the browser reports an error.
+        """
+        try:
+            elements = driver.find_elements(By.XPATH, xpath)
+            if attr == '_text_':
+                return [element.text for element in elements]
+            return [element.get_attribute(attr) for element in elements]
+        except WebDriverException:
+            return None
+
+    def _real_extract(self, url):
+        """Open url in the browser chosen with --browser and return the info dict.
+
+        Raises ExtractorError when Selenium or the webdriver is unavailable,
+        when the site shows a known error page, or when the player, the video
+        id or the video URL cannot be found.
+        """
+        if webdriver is None:
+            raise ExtractorError(
+                'ThisVid requires Selenium, see https://pypi.python.org/pypi/selenium',
+                expected=True)
+
+        params = self._downloader.params
+        be_verbose = params.get('verbose', False)
+        display_id = re.match(self._VALID_URL, url).group('display_id')
+
+        # 'web_driver' may be absent when the params do not come from the command line
+        wanted_driver = params.get('web_driver') or 'chrome'
+        driver_class = getattr(
+            webdriver, self._WEBDRIVER_CLASSES.get(wanted_driver, ''), None)
+        if driver_class is None:
+            raise ExtractorError(
+                "The installed Selenium has no driver for '%s'" % wanted_driver,
+                expected=True)
+        if be_verbose:
+            self.to_screen('Extractor started. Connecting to the selected browser (%s)' % wanted_driver)
+
+        try:
+            driver = driver_class()
+        except Exception:
+            # Start-up failures are driver specific (missing executable,
+            # refused connection, ...), hence the single broad catch here.
+            raise ExtractorError(
+                "Could not connect to the webdriver. Make sure you have installed the webdriver for '%s'" % wanted_driver,
+                expected=True)
+
+        try:
+            info = self._extract_info(driver, url, display_id, be_verbose)
+        finally:
+            # Close the browser on every path, including failed extractions
+            driver.quit()
+
+        if be_verbose:
+            self.to_screen('Will return the following data')
+            self.to_screen(info)
+
+        return info
+
+    def _extract_info(self, driver, url, display_id, be_verbose):
+        """Drive the already started browser through the page and build the info dict."""
+        try:
+            driver.get(url)
+            source = driver.page_source
+        except WebDriverException:
+            raise ExtractorError('The browser could not load %s' % url, expected=True)
+
+        for marker, message in self._PAGE_ERRORS:
+            if marker in source:
+                raise ExtractorError('ThisVid said: %s' % message, expected=True)
+
+        # Click the Play button. A missing button raises instead of returning
+        # None, so it has to be caught rather than tested.
+        try:
+            driver.find_element(By.CLASS_NAME, 'fp-play').click()
+        except WebDriverException:
+            raise ExtractorError(
+                'ThisVid said: Page does not contain expected data', expected=True)
+
+        # Wait until the true URL appears in the DOM
+        try:
+            WebDriverWait(driver, self._PLAYER_TIMEOUT).until(
+                EC.presence_of_element_located((By.CLASS_NAME, 'fp-engine')))
+        except WebDriverException:
+            raise ExtractorError('ThisVid said: Browser timed out', expected=True)
+
+        og_url = self.try_find_element_attribute(
+            driver, "//meta[@property='og:video:url'][1]", 'content')
+        id_match = re.search(r'/([0-9]+)/', og_url or '')
+        if id_match is None:
+            raise ExtractorError('Unable to find the video id in the page')
+        video_id = id_match.group(1)
+        if be_verbose:
+            self.to_screen('Found video id %s' % video_id)
+
+        video_url = self.try_find_element_attribute(
+            driver, "//video[@class='fp-engine'][1]", 'src')
+        if not video_url:
+            raise ExtractorError('Unable to find the video URL in the page')
+
+        title = self.try_find_element_attribute(
+            driver, "//meta[@property='og:title'][1]", 'content') or display_id
+        if title.endswith(self._TITLE_SUFFIX):
+            title = title[:-len(self._TITLE_SUFFIX)]
+
+        info = {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'ext': 'mp4',
+            'age_limit': 18,
+            'extractor': 'ThisVid',
+        }
+        info.update(self._extract_metadata(driver))
+        return info
+
+    def _extract_metadata(self, driver):
+        """Collect the optional fields; a field that cannot be read is set to None."""
+        def meta(name):
+            return self.try_find_element_attribute(
+                driver, "//meta[@property='%s'][1]" % name, 'content')
+
+        def tool(span_class):
+            return self.try_find_elements_attribute(
+                driver, "//ul[@class='tools-left']/li/span[@class='%s']" % span_class, '_text_') or []
+
+        names = tool('title')
+        # Values are read by position: views, release date, duration
+        values = tool('title-description') + [None] * 3
+        rating = re.search(r'Rating:\s([0-9]\.[0-9])', names[0] if names else '')
+
+        metadata = {
+            'average_rating': float(rating.group(1)) if rating else None,
+            'view_count': _int_or_none(values[0]),
+            # NOTE(review): passed through as displayed by the site; confirm
+            # it is in the YYYYMMDD form youtube-dl expects for dates.
+            'release_date': values[1],
+            'duration': _duration_in_seconds(values[2]),
+            'description': meta('og:description'),
+            'tags': self.try_find_elements_attribute(
+                driver, "//meta[@property='og:video:tag']", 'content'),
+            'width': _int_or_none(meta('og:video:width')),
+            'height': _int_or_none(meta('og:video:height')),
+            'thumbnail': meta('og:image'),
+        }
+
+        try:
+            links = driver.find_elements(By.XPATH, "//ul[@class='description']/li/a")
+            if links:
+                metadata.update({
+                    # There is only one category per movie
+                    'categories': [links[0].text],
+                    'uploader_id': links[-1].text,
+                    'uploader_url': links[-1].get_attribute('href'),
+                })
+        except WebDriverException:
+            self.to_screen('Exception while getting extra info')
+
+        return metadata
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 7d1bbc021..0660c26bd 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -4,6 +4,7 @@ import os.path
 import optparse
+import platform
 import re
 import sys
 
 from .downloader.external import list_external_downloaders
 from .compat import (
@@ -445,7 +446,16 @@ def parseOpts(overrideArguments=None):
         default=[], callback=_comma_separated_values_options_callback,
         help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags')
 
+    # Default webdriver is Chrome, except on the Mac
+    default_driver = 'safari' if platform.system() == 'Darwin' else 'chrome'
+
     downloader = optparse.OptionGroup(parser, 'Download Options')
+    downloader.add_option(
+        '--browser',
+        action='store', dest='web_driver', metavar='BROWSER',
+        choices=['safari', 'chrome', 'firefox', 'edge', 'ie', 'opera', 'webkitgtk', 'android'],
+        default=default_driver,
+        help='Browser to use for websites requiring interaction. See Selenium for more information.')
     downloader.add_option(
         '-r', '--limit-rate', '--rate-limit',
         dest='ratelimit', metavar='RATE',