mirror of
https://github.com/l1ving/youtube-dl
synced 2025-02-03 01:05:44 +08:00
• New extractor for porn site ThisVid. By contrast with other extractors, it requires JavaScript in order to determine the video URL. That is why it uses Selenium (license Apache 2). The Generic extractor was already capable to download many of the videos but only in 240p.
• Selenium requires the installation of a free webdriver (see https://pypi.python.org/pypi/selenium ), except for Safari 10+ • Added new option --browser in the command line to indicate the desired browser. It defaults to Safari on the Mac and Chrome elsewhere. NOTE that I have only tested Safari and Chrome until now, though Selenium is compatible with 'firefox', 'edge', 'ie', 'opera', 'webkitgtk', 'android' • ThisVid can detect several common errors like a deleted file or insufficient privileges • ThisVid does not support authentication yet
This commit is contained in:
parent
5634bbfe7f
commit
39cd85c27e
@ -433,6 +433,7 @@ def _real_main(argv=None):
|
||||
# just for deprecation check
|
||||
'autonumber': opts.autonumber if opts.autonumber is True else None,
|
||||
'usetitle': opts.usetitle if opts.usetitle is True else None,
|
||||
'web_driver': opts.web_driver,
|
||||
}
|
||||
|
||||
with YoutubeDL(ydl_opts) as ydl:
|
||||
|
@ -1079,6 +1079,7 @@ from .theweatherchannel import TheWeatherChannelIE
|
||||
from .thisamericanlife import ThisAmericanLifeIE
|
||||
from .thisav import ThisAVIE
|
||||
from .thisoldhouse import ThisOldHouseIE
|
||||
from .thisvid import thisvidIE
|
||||
from .threeqsdn import ThreeQSDNIE
|
||||
from .tinypic import TinyPicIE
|
||||
from .tmz import (
|
||||
|
169
youtube_dl/extractor/thisvid.py
Executable file
169
youtube_dl/extractor/thisvid.py
Executable file
@ -0,0 +1,169 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from ..utils import (
|
||||
ExtractorError,
|
||||
)
|
||||
import re
|
||||
|
||||
from .common import InfoExtractor
|
||||
|
||||
# Process downloads from porn site ThisVid.com
|
||||
# Requires Selenium (license Apache 2.0)
|
||||
|
||||
class thisvidIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<display_id>\w+)'
|
||||
_TEST = {
|
||||
'url': 'https://thisvid.com/videos/final-impact-3/',
|
||||
'md5': '8302bd736a1e4198ed80db4a0d0dd012',
|
||||
'info_dict': {
|
||||
'id': '490698',
|
||||
'ext': 'mp4',
|
||||
'title': 'Final impact_3',
|
||||
}
|
||||
}
|
||||
|
||||
def try_find_element_attribute(self, driver, xpath, attr):
|
||||
# If attr is "_text_", get element text instead
|
||||
try:
|
||||
if attr == "_text_":
|
||||
return driver.find_element_by_xpath( xpath ).text
|
||||
else:
|
||||
return driver.find_element_by_xpath( xpath ).get_attribute( attr )
|
||||
except:
|
||||
return None
|
||||
|
||||
def try_find_elements_attribute(self, driver, xpath, attr):
|
||||
# Same as try_find_element_attribute but for a list
|
||||
try:
|
||||
results = []
|
||||
objs = driver.find_elements_by_xpath( xpath )
|
||||
|
||||
if attr == "_text_":
|
||||
for t in objs:
|
||||
results.append(t.text)
|
||||
else:
|
||||
for t in objs:
|
||||
results.append(t.get_attribute(attr))
|
||||
|
||||
return results
|
||||
except:
|
||||
return None
|
||||
|
||||
def _real_extract(self, url):
|
||||
dict = { 'age_limit': 18,
|
||||
'ext': 'mp4',
|
||||
'extractor': 'ThisVid'}
|
||||
|
||||
be_verbose = self._downloader.params.get('verbose', False)
|
||||
|
||||
mobj = re.match(self._VALID_URL, url)
|
||||
display_id = mobj.group('display_id')
|
||||
error_msg = ""
|
||||
driver = None
|
||||
source = ""
|
||||
wanted_driver = self._downloader.params.get('web_driver')
|
||||
w_driver = getattr(webdriver, wanted_driver.capitalize())
|
||||
if be_verbose:
|
||||
self.to_screen('Extractor started. Connecting to the selected browser (%s)' % wanted_driver)
|
||||
|
||||
try:
|
||||
driver = w_driver()
|
||||
driver.get(url)
|
||||
source = driver.page_source
|
||||
except:
|
||||
error_msg = "Could not connect to the webdriver. Make sure you have installed the webdriver for '%s'" % wanted_driver
|
||||
pass
|
||||
|
||||
# Check common errors
|
||||
if "SORRY, THE FILE DOES NOT EXIST YET" in source:
|
||||
# Error says 'yet' but removed files can also cause that error
|
||||
error_msg = "This file does not exist on Thisvid.com"
|
||||
elif "Sorry, this file was deleted" in source:
|
||||
error_msg = "This file has been deleted from Thisvid.com"
|
||||
elif "This video is a private video" in source:
|
||||
error_msg = "This video is private"
|
||||
|
||||
if error_msg:
|
||||
if driver is not None:
|
||||
driver.quit()
|
||||
raise ExtractorError(
|
||||
'ThisVid said: %s' % error_msg,
|
||||
expected=True)
|
||||
|
||||
# Click the Play button
|
||||
content = driver.find_element_by_class_name('fp-play')
|
||||
if content is not None:
|
||||
content.click()
|
||||
else:
|
||||
driver.quit()
|
||||
error_msg = "Page does not contain expected data"
|
||||
raise ExtractorError(
|
||||
'ThisVid said: %s' % error_msg,
|
||||
expected=True)
|
||||
|
||||
try: # Until the true URL appears in the DOM
|
||||
element = WebDriverWait(driver, 30).until( EC.presence_of_element_located((By.CLASS_NAME, "fp-engine")))
|
||||
except:
|
||||
error_msg = "Browser timed out"
|
||||
driver.quit()
|
||||
raise ExtractorError(
|
||||
'ThisVid said: %s' % error_msg,
|
||||
expected=True)
|
||||
|
||||
og_url = self.try_find_element_attribute( driver, "//meta[@property='og:video:url'][1]", "content" )
|
||||
hits = re.findall(r'/([0-9]+)/', og_url)
|
||||
video_id = hits[0]
|
||||
dict['id'] = video_id
|
||||
if be_verbose:
|
||||
self.to_screen("Found video id %s" % dict['id'])
|
||||
|
||||
dict['url'] = self.try_find_element_attribute( driver, "//video[@class='fp-engine'][1]", "src" )
|
||||
|
||||
video_title = self.try_find_element_attribute( driver, "//meta[@property='og:title'][1]", "content" )
|
||||
video_title = video_title[:len(video_title)-14] # Strip the last " - ThisVid.com" from the title
|
||||
dict['title'] = video_title
|
||||
|
||||
# Gather other data
|
||||
try:
|
||||
elem_names = self.try_find_elements_attribute( driver, "//ul[@class='tools-left']/li/span[@class='title']", "_text_" )
|
||||
elem_values = self.try_find_elements_attribute( driver, "//ul[@class='tools-left']/li/span[@class='title-description']", "_text_" )
|
||||
hits = re.findall(r'Rating:\s([0-9]\.[0-9])', elem_names[0])
|
||||
video_rating = hits[0]
|
||||
|
||||
dict['view_count'] = int( elem_values[0] )
|
||||
dict['release_date'] = elem_values[1]
|
||||
|
||||
hits = re.findall(r'([0-9]+):([0-9]+)', elem_values[2])
|
||||
video_duration_in_seconds = int(hits[0][0])*60 + int(hits[0][1])
|
||||
dict['duration'] = video_duration_in_seconds
|
||||
|
||||
dict['description'] = self.try_find_element_attribute(driver, "//meta[@property='og:description'][1]",
|
||||
"content")
|
||||
dict['tags'] = self.try_find_elements_attribute(driver, "//meta[@property='og:video:tag']", "content")
|
||||
dict['width'] = int(
|
||||
self.try_find_element_attribute(driver, "//meta[@property='og:video:width'][1]", "content"))
|
||||
dict['height'] = int(
|
||||
self.try_find_element_attribute(driver, "//meta[@property='og:video:height'][1]", "content"))
|
||||
dict['thumbnail'] = self.try_find_element_attribute(driver, "//meta[@property='og:image'][1]", "content")
|
||||
|
||||
desc_block = driver.find_elements_by_xpath("//ul[@class='description']/li/a")
|
||||
|
||||
dict['categories'] = [desc_block[0].text] # There is only one category per movie
|
||||
dict['uploader_id'] = desc_block[-1].text
|
||||
dict['uploader_url'] = desc_block[-1].get_attribute("href")
|
||||
|
||||
except:
|
||||
self.to_screen("Exception while getting extra info")
|
||||
pass
|
||||
|
||||
driver.quit()
|
||||
|
||||
if be_verbose:
|
||||
self.to_screen("Will return the following data")
|
||||
self.to_screen( dict )
|
||||
|
||||
return dict
|
@ -4,6 +4,7 @@ import os.path
|
||||
import optparse
|
||||
import re
|
||||
import sys
|
||||
import platform
|
||||
|
||||
from .downloader.external import list_external_downloaders
|
||||
from .compat import (
|
||||
@ -445,7 +446,16 @@ def parseOpts(overrideArguments=None):
|
||||
default=[], callback=_comma_separated_values_options_callback,
|
||||
help='Languages of the subtitles to download (optional) separated by commas, use --list-subs for available language tags')
|
||||
|
||||
default_driver = 'chrome' #Default webdriver is Chrome, except on the Mac
|
||||
if platform.system()=="Darwin":
|
||||
default_driver = 'safari'
|
||||
|
||||
downloader = optparse.OptionGroup(parser, 'Download Options')
|
||||
downloader.add_option(
|
||||
'--browser',
|
||||
action='store', dest='web_driver', choices=['safari', 'chrome', 'firefox', 'edge', 'ie', 'opera', 'webkitgtk', 'android'],
|
||||
default=default_driver,
|
||||
help='Browser to use for websites requiring interaction. See Selenium for more information.')
|
||||
downloader.add_option(
|
||||
'-r', '--limit-rate', '--rate-limit',
|
||||
dest='ratelimit', metavar='RATE',
|
||||
|
Loading…
Reference in New Issue
Block a user