mirror of
https://github.com/l1ving/youtube-dl
synced 2025-02-13 18:02:52 +08:00
• Selenium requires the installation of a free webdriver (see https://pypi.python.org/pypi/selenium ), except for Safari 10+.
• Added a new command-line option, --browser, to indicate the desired browser. It defaults to Safari on the Mac and Chrome elsewhere. NOTE: only Safari and Chrome have been tested so far, though Selenium is also compatible with 'firefox', 'edge', 'ie', 'opera', 'webkitgtk' and 'android'.
• ThisVid can detect several common errors, such as a deleted file or insufficient privileges.
• ThisVid does not support authentication yet.
170 lines
6.5 KiB
Python
Executable File
170 lines
6.5 KiB
Python
Executable File
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from ..utils import (
|
|
ExtractorError,
|
|
)
|
|
import re
|
|
|
|
from .common import InfoExtractor
|
|
|
|
# Process downloads from porn site ThisVid.com
|
|
# Requires Selenium (license Apache 2.0)
|
|
|
|
class thisvidIE(InfoExtractor):
    """Extract video information from ThisVid.com pages through Selenium.

    The page is opened in a real browser (chosen by the ``web_driver``
    downloader parameter), the player's play button is clicked, and the media
    URL is read from the ``<video class="fp-engine">`` element once it shows
    up in the DOM.
    """

    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<display_id>\w+)'
    _TEST = {
        'url': 'https://thisvid.com/videos/final-impact-3/',
        'md5': '8302bd736a1e4198ed80db4a0d0dd012',
        'info_dict': {
            'id': '490698',
            'ext': 'mp4',
            'title': 'Final impact_3',
        }
    }

    # Suffix removed from the og:title value when it is present.
    _TITLE_SUFFIX = ' - ThisVid.com'

    # (marker looked up in the page source, message reported to the user)
    _PAGE_ERRORS = (
        # The site says 'yet', but removed files can also cause this error.
        ('SORRY, THE FILE DOES NOT EXIST YET',
         'This file does not exist on Thisvid.com'),
        ('Sorry, this file was deleted',
         'This file has been deleted from Thisvid.com'),
        ('This video is a private video',
         'This video is private'),
    )

    def try_find_element_attribute(self, driver, xpath, attr):
        """Return an attribute of the first element matching ``xpath``.

        If ``attr`` is ``"_text_"`` the element text is returned instead of an
        attribute. Returns ``None`` when the lookup fails for any reason.
        """
        try:
            element = driver.find_element(By.XPATH, xpath)
            if attr == '_text_':
                return element.text
            return element.get_attribute(attr)
        except Exception:
            # Best-effort lookup: callers treat None as "not available".
            return None

    def try_find_elements_attribute(self, driver, xpath, attr):
        """Same as try_find_element_attribute, but for every matching element.

        Returns a list (possibly empty), or ``None`` when the lookup fails.
        """
        try:
            elements = driver.find_elements(By.XPATH, xpath)
            if attr == '_text_':
                return [element.text for element in elements]
            return [element.get_attribute(attr) for element in elements]
        except Exception:
            # Best-effort lookup: callers treat None as "not available".
            return None

    def _raise_error(self, message):
        """Raise an expected ExtractorError using the extractor's prefix."""
        raise ExtractorError('ThisVid said: %s' % message, expected=True)

    def _add_page_stats(self, driver, info):
        """Fill view count, release date and duration from the tools list."""
        values = self.try_find_elements_attribute(
            driver,
            "//ul[@class='tools-left']/li/span[@class='title-description']",
            '_text_')

        info['view_count'] = int(values[0])
        # NOTE(review): youtube-dl documents release_date as YYYYMMDD; the
        # format of the page value is not visible here - confirm.
        info['release_date'] = values[1]

        # Fold every colon-separated field so h:mm:ss works as well as m:ss.
        clock = re.search(r'[0-9]+(?::[0-9]+)+', values[2])
        seconds = 0
        for part in clock.group(0).split(':'):
            seconds = seconds * 60 + int(part)
        info['duration'] = seconds

    def _add_og_metadata(self, driver, info):
        """Fill description, tags, thumbnail and size from Open Graph tags."""
        info['description'] = self.try_find_element_attribute(
            driver, "//meta[@property='og:description'][1]", 'content')
        info['tags'] = self.try_find_elements_attribute(
            driver, "//meta[@property='og:video:tag']", 'content')
        info['thumbnail'] = self.try_find_element_attribute(
            driver, "//meta[@property='og:image'][1]", 'content')
        # int() raises when a size tag is missing, so these go last.
        info['width'] = int(self.try_find_element_attribute(
            driver, "//meta[@property='og:video:width'][1]", 'content'))
        info['height'] = int(self.try_find_element_attribute(
            driver, "//meta[@property='og:video:height'][1]", 'content'))

    def _add_uploader_info(self, driver, info):
        """Fill category and uploader from the links of the description list."""
        links = driver.find_elements(
            By.XPATH, "//ul[@class='description']/li/a")

        info['categories'] = [links[0].text]  # There is only one category per movie
        info['uploader_id'] = links[-1].text
        info['uploader_url'] = links[-1].get_attribute('href')

    def _real_extract(self, url):
        """Load ``url`` in the selected browser and return the info dict.

        Raises ExtractorError (expected) when the browser cannot be started,
        the site reports a known error, or the page lacks the required data.
        The browser is always closed before returning or raising.
        """
        info = {
            'age_limit': 18,
            'ext': 'mp4',
            'extractor': 'ThisVid',
        }

        be_verbose = self._downloader.params.get('verbose', False)
        wanted_driver = self._downloader.params.get('web_driver')

        # Selenium exposes one class per browser (Chrome, Safari, ...).
        driver_factory = getattr(
            webdriver, (wanted_driver or '').capitalize(), None)
        if driver_factory is None:
            self._raise_error(
                "Browser '%s' is not supported by Selenium" % wanted_driver)

        if be_verbose:
            self.to_screen('Extractor started. Connecting to the selected browser (%s)' % wanted_driver)

        driver = None
        try:
            try:
                driver = driver_factory()
                driver.get(url)
                source = driver.page_source
            except Exception:
                self._raise_error(
                    "Could not connect to the webdriver. Make sure you have installed the webdriver for '%s'" % wanted_driver)

            # Check common errors
            for marker, message in self._PAGE_ERRORS:
                if marker in source:
                    self._raise_error(message)

            # Click the Play button; the locator raises when it is absent.
            try:
                play_button = driver.find_element(By.CLASS_NAME, 'fp-play')
            except Exception:
                self._raise_error('Page does not contain expected data')
            play_button.click()

            # Wait until the true URL appears in the DOM
            try:
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'fp-engine')))
            except Exception:
                self._raise_error('Browser timed out')

            # The video id is the first all-digit path segment of og:video:url.
            og_url = self.try_find_element_attribute(
                driver, "//meta[@property='og:video:url'][1]", 'content')
            id_match = re.search(r'/([0-9]+)/', og_url or '')
            if id_match is None:
                self._raise_error('Page does not contain expected data')
            info['id'] = id_match.group(1)
            if be_verbose:
                self.to_screen('Found video id %s' % info['id'])

            info['url'] = self.try_find_element_attribute(
                driver, "//video[@class='fp-engine'][1]", 'src')
            if not info['url']:
                self._raise_error('Page does not contain expected data')

            video_title = self.try_find_element_attribute(
                driver, "//meta[@property='og:title'][1]", 'content')
            if not video_title:
                self._raise_error('Page does not contain expected data')
            # Strip the trailing " - ThisVid.com" only when it is really there.
            if video_title.endswith(self._TITLE_SUFFIX):
                video_title = video_title[:-len(self._TITLE_SUFFIX)]
            info['title'] = video_title

            # Gather other data. Each group is optional and independent, so a
            # failure in one does not discard what the others can provide.
            optional_steps = (
                self._add_page_stats,
                self._add_og_metadata,
                self._add_uploader_info,
            )
            for step in optional_steps:
                try:
                    step(driver, info)
                except Exception:
                    self.to_screen('Exception while getting extra info')
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # A failing shutdown must not mask the real outcome.
                    pass

        if be_verbose:
            self.to_screen('Will return the following data')
            self.to_screen(info)

        return info
|