1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-03 01:05:44 +08:00

• New extractor for the porn site ThisVid. Unlike other extractors, it requires JavaScript to determine the video URL, which is why it uses Selenium (Apache 2.0 license). The Generic extractor was already capable of downloading many of the videos, but only in 240p.

• Selenium requires the installation of a free webdriver (see https://pypi.python.org/pypi/selenium ), except for Safari 10+
• Added a new command-line option, --browser, to select the desired browser. It defaults to Safari on the Mac and Chrome elsewhere. NOTE: so far I have only tested Safari and Chrome, though Selenium is also compatible with 'firefox', 'edge', 'ie', 'opera', 'webkitgtk' and 'android'
• ThisVid can detect several common errors like a deleted file or insufficient privileges
• ThisVid does not support authentication yet
This commit is contained in:
Jessie C 2018-03-28 12:46:55 +02:00
parent 5634bbfe7f
commit 39cd85c27e
4 changed files with 181 additions and 0 deletions

View File

@ -433,6 +433,7 @@ def _real_main(argv=None):
# just for deprecation check
'autonumber': opts.autonumber if opts.autonumber is True else None,
'usetitle': opts.usetitle if opts.usetitle is True else None,
'web_driver': opts.web_driver,
}
with YoutubeDL(ydl_opts) as ydl:

View File

@ -1079,6 +1079,7 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .thisvid import thisvidIE
from .threeqsdn import ThreeQSDNIE
from .tinypic import TinyPicIE
from .tmz import (

169
youtube_dl/extractor/thisvid.py Executable file
View File

@ -0,0 +1,169 @@
# coding: utf-8
from __future__ import unicode_literals
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ..utils import (
ExtractorError,
)
import re
from .common import InfoExtractor
# Process downloads from porn site ThisVid.com
# Requires Selenium (license Apache 2.0)
class thisvidIE(InfoExtractor):
    """Extract videos from ThisVid.com by driving a real browser through Selenium.

    The page is loaded in the browser selected by the ``web_driver``
    downloader parameter (the ``--browser`` option), the player's play button
    is clicked, and the video URL is then read back from the ``<video>``
    element that appears in the DOM.
    """

    # The slug may contain hyphens (e.g. "final-impact-3"), so match up to the
    # next URL delimiter instead of only word characters.
    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<display_id>[^/?#&]+)'
    _TEST = {
        'url': 'https://thisvid.com/videos/final-impact-3/',
        'md5': '8302bd736a1e4198ed80db4a0d0dd012',
        'info_dict': {
            'id': '490698',
            'ext': 'mp4',
            'title': 'Final impact_3',
        }
    }

    # Suffix the site appends to every og:title.
    _TITLE_SUFFIX = ' - ThisVid.com'

    # Browser names whose Selenium class is not simply the capitalized name.
    _DRIVER_CLASS_NAMES = {
        'webkitgtk': 'WebKitGTK',
    }

    # (marker found in the page source, message reported to the user),
    # checked in this order.
    _ERROR_PAGES = (
        # The site says 'yet', but removed files can also cause this error
        ('SORRY, THE FILE DOES NOT EXIST YET', 'This file does not exist on Thisvid.com'),
        ('Sorry, this file was deleted', 'This file has been deleted from Thisvid.com'),
        ('This video is a private video', 'This video is private'),
    )

    @staticmethod
    def _site_error(message):
        """Build the expected (user-facing) error used for site/browser problems."""
        return ExtractorError('ThisVid said: %s' % message, expected=True)

    @staticmethod
    def _int_or_none(value):
        """Convert *value* to int, returning None when it is missing or not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def try_find_element_attribute(self, driver, xpath, attr):
        """Return one attribute of the first element matching *xpath*, or None.

        Pass ``"_text_"`` as *attr* to get the element's text instead of an
        attribute. Any lookup failure (no such element, browser error) yields
        None rather than raising.
        """
        try:
            element = driver.find_element(By.XPATH, xpath)
            if attr == "_text_":
                return element.text
            return element.get_attribute(attr)
        except Exception:
            return None

    def try_find_elements_attribute(self, driver, xpath, attr):
        """Return the attribute of every element matching *xpath* as a list.

        Same conventions as try_find_element_attribute: ``"_text_"`` selects
        the element text, and None is returned if the lookup fails.
        """
        try:
            elements = driver.find_elements(By.XPATH, xpath)
            if attr == "_text_":
                return [element.text for element in elements]
            return [element.get_attribute(attr) for element in elements]
        except Exception:
            return None

    def _get_driver_class(self, browser):
        """Map a ``--browser`` value to the matching Selenium WebDriver class.

        Raises ExtractorError when no browser was configured or when the
        installed Selenium has no driver class for it.
        """
        if not browser:
            raise ExtractorError(
                'No browser selected; use the --browser option (web_driver parameter)',
                expected=True)
        class_name = self._DRIVER_CLASS_NAMES.get(browser, browser.capitalize())
        driver_class = getattr(webdriver, class_name, None)
        if driver_class is None:
            raise ExtractorError(
                "The installed Selenium does not provide a webdriver for '%s'" % browser,
                expected=True)
        return driver_class

    def _real_extract(self, url):
        """Load *url* in the selected browser and return the video info dict.

        Raises ExtractorError if the browser cannot be started, the site
        reports the video as missing/deleted/private, or the player never
        exposes the video.
        """
        params = self._downloader.params
        be_verbose = params.get('verbose', False)
        display_id = re.match(self._VALID_URL, url).group('display_id')
        wanted_driver = params.get('web_driver')
        driver_class = self._get_driver_class(wanted_driver)

        if be_verbose:
            self.to_screen('Extractor started. Connecting to the selected browser (%s)' % wanted_driver)

        driver = None
        try:
            try:
                driver = driver_class()
                driver.get(url)
                source = driver.page_source
            except Exception:
                raise self._site_error(
                    "Could not connect to the webdriver. Make sure you have installed the webdriver for '%s'" % wanted_driver)
            info = self._extract_info(driver, source, display_id, be_verbose)
        finally:
            # Always close the browser, including on unexpected errors.
            if driver is not None:
                driver.quit()

        if be_verbose:
            self.to_screen("Will return the following data")
            self.to_screen(info)
        return info

    def _extract_info(self, driver, source, display_id, be_verbose):
        """Start playback in the already-loaded page and collect the info dict."""
        # Check common errors
        for marker, message in self._ERROR_PAGES:
            if marker in source:
                raise self._site_error(message)

        # Click the Play button. find_elements returns an empty list when
        # nothing matches, so a missing button is reported cleanly.
        play_buttons = driver.find_elements(By.CLASS_NAME, 'fp-play')
        if not play_buttons:
            raise self._site_error("Page does not contain expected data")
        play_buttons[0].click()

        try:  # Until the true URL appears in the DOM
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "fp-engine")))
        except Exception:
            raise self._site_error("Browser timed out")

        og_url = self.try_find_element_attribute(
            driver, "//meta[@property='og:video:url'][1]", "content")
        id_match = re.search(r'/([0-9]+)/', og_url or '')
        if id_match is None:
            raise ExtractorError('Unable to extract video id')
        video_id = id_match.group(1)
        if be_verbose:
            self.to_screen("Found video id %s" % video_id)

        video_url = self.try_find_element_attribute(
            driver, "//video[@class='fp-engine'][1]", "src")
        if not video_url:
            raise ExtractorError('Unable to extract video URL')

        video_title = self.try_find_element_attribute(
            driver, "//meta[@property='og:title'][1]", "content")
        if video_title and video_title.endswith(self._TITLE_SUFFIX):
            # Strip the last " - ThisVid.com" from the title
            video_title = video_title[:-len(self._TITLE_SUFFIX)]

        info = {
            'age_limit': 18,
            'ext': 'mp4',
            'extractor': 'ThisVid',
            'id': video_id,
            'display_id': display_id,
            'url': video_url,
            # A title is mandatory; fall back to the URL slug if the page has none.
            'title': video_title or display_id,
        }

        # Gather other data; whatever was collected before a failure is kept.
        try:
            self._add_optional_metadata(driver, info)
        except Exception:
            self.to_screen("Exception while getting extra info")
        return info

    def _add_optional_metadata(self, driver, info):
        """Add best-effort metadata to *info* in place.

        Each field is only set when it could be read, so one missing value
        does not prevent the others from being extracted.
        """
        def meta(prop):
            return self.try_find_element_attribute(
                driver, "//meta[@property='%s'][1]" % prop, "content")

        elem_names = self.try_find_elements_attribute(
            driver, "//ul[@class='tools-left']/li/span[@class='title']", "_text_") or []
        elem_values = self.try_find_elements_attribute(
            driver, "//ul[@class='tools-left']/li/span[@class='title-description']", "_text_") or []

        optional = {}
        if elem_names:
            rating = re.search(r'Rating:\s([0-9]\.[0-9])', elem_names[0])
            if rating is not None:
                optional['average_rating'] = float(rating.group(1))
        if len(elem_values) > 0:
            optional['view_count'] = self._int_or_none(elem_values[0])
        if len(elem_values) > 1:
            # NOTE(review): passed through as displayed by the site; confirm
            # the format matches what youtube-dl expects for release_date.
            optional['release_date'] = elem_values[1]
        if len(elem_values) > 2:
            clock = re.search(r'[0-9]+(?::[0-9]+)+', elem_values[2])
            if clock is not None:
                # Fold every ':'-separated field, so both MM:SS and a
                # possible H:MM:SS are converted to seconds.
                seconds = 0
                for part in clock.group(0).split(':'):
                    seconds = seconds * 60 + int(part)
                optional['duration'] = seconds

        optional['description'] = meta('og:description')
        optional['tags'] = self.try_find_elements_attribute(
            driver, "//meta[@property='og:video:tag']", "content")
        optional['width'] = self._int_or_none(meta('og:video:width'))
        optional['height'] = self._int_or_none(meta('og:video:height'))
        optional['thumbnail'] = meta('og:image')

        info.update(
            (key, value) for key, value in optional.items() if value is not None)

        desc_block = driver.find_elements(By.XPATH, "//ul[@class='description']/li/a")
        if desc_block:
            info['categories'] = [desc_block[0].text]  # There is only one category per movie
            info['uploader_id'] = desc_block[-1].text
            info['uploader_url'] = desc_block[-1].get_attribute("href")

View File

@ -4,6 +4,7 @@ import os.path
import optparse
import re
import sys
import platform
from .downloader.external import list_external_downloaders
from .compat import (
@ -445,7 +446,16 @@ def parseOpts(overrideArguments=None):
default=[], callback=_comma_separated_values_options_callback,
help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags')
default_driver = 'chrome' #Default webdriver is Chrome, except on the Mac
if platform.system()=="Darwin":
default_driver = 'safari'
downloader = optparse.OptionGroup(parser, 'Download Options')
downloader.add_option(
'--browser',
action='store', dest='web_driver', choices=['safari', 'chrome', 'firefox', 'edge', 'ie', 'opera', 'webkitgtk', 'android'],
default=default_driver,
help='Browser to use for websites requiring interaction. See Selenium for more information.')
downloader.add_option(
'-r', '--limit-rate', '--rate-limit',
dest='ratelimit', metavar='RATE',