1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-09 23:17:22 +08:00

[thisvid] Add extractor

This commit is contained in:
jhwgh1968 2019-06-14 21:22:27 -05:00
parent 695720ebe8
commit 23c4fb5e3e
2 changed files with 136 additions and 0 deletions

View File

@ -1162,6 +1162,10 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .thisvid import (
ThisVidIE,
ThisVidEmbeddedIE
)
from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,

View File

@ -0,0 +1,132 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from .openload import PhantomJSwrapper
class ThisVidIE(InfoExtractor):
    """Extractor for ``thisvid.com/videos/<slug>`` pages.

    The page's static HTML carries a broken media URL; the player's own
    javascript repairs it and stores the result in a ``flashvars`` variable.
    That javascript is executed through PhantomJS and the resulting
    ``flashvars`` object is read back to obtain the media and thumbnail URLs.
    """

    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<id>[A-Za-z0-9-]+)'
    _TEST = {
        'url': 'https://thisvid.com/videos/madonna-show-in-sexy-underwear/',
        'md5': '48e38730d38394c6e9f1cce66fb04c6e',
        'info_dict': {
            'id': '829503',
            'display_id': 'madonna-show-in-sexy-underwear',
            'ext': 'mp4',
            'title': 'Madonna show in sexy underwear',
            'thumbnail': r're:^https?://.*preview\.mp4\.jpg$',
            'uploader_id': 'Mike_Hunt',
            'uploader_url': 'https://thisvid.com/members/584768',
            'age_limit': 18,
        },
    }

    def _extract_flashvars(self, url, webpage, display_id):
        """Run the page javascript in PhantomJS and return its flashvars.

        Returns a ``(webpage, flashvars)`` tuple: the page HTML as returned
        by PhantomJS and the parsed ``flashvars`` object.
        """
        # The webpage contains a raw piece of javascript which creates a
        # variable called flashvars used by the player to update the webpage.
        #
        # Running the javascript fixes the URL in the static HTML which is
        # broken, and updates the flashvars variable with the new info.
        #
        # Because there are a ton of errors from PhantomJS that do not affect
        # the output, they have to be split from the actual JSON.
        jscode = """
            function checkFlashVars() {
                flashvars = page.evaluate(function() {
                    return JSON.stringify(flashvars)
                });
                console.log('---'); // ensure any errors appear above where we are
                console.log(flashvars);
                saveAndExit();
            }
            checkFlashVars();"""
        phantom = PhantomJSwrapper(self, required_version='2.0')
        webpage, output = phantom.get(url, html=webpage, jscode=jscode)
        # Everything after the first marker is the JSON payload. If the
        # marker is missing, the empty string is handed to _parse_json
        # instead of failing here with a bare IndexError.
        payload = output.partition('---')[2]
        return webpage, self._parse_json(payload, display_id)

    @staticmethod
    def _clean_video_url(video_url):
        """Normalise a media URL taken from flashvars.

        Strips an unprocessed ``function/<n>/`` prefix, any query string and
        trailing slashes.
        """
        # The value in the static HTML starts with "function/0/http://..."
        # where the zero is sometimes another number.
        if video_url.startswith('function'):
            # maxsplit=2 keeps the embedded URL, which itself contains
            # slashes, intact as the final element.
            video_url = video_url.split('/', 2)[-1]
        # Sometimes the video url ends with ".mp4",
        # other times it ends with ".mp4/",
        # yet other times it ends with ".mp4/?".
        #
        # All of it needs to be cleaned up.
        return video_url.split('?', 1)[0].rstrip('/')

    def _real_extract(self, url):
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        webpage, flashvars = self._extract_flashvars(url, webpage, display_id)

        # Get the video URL from the flashvars.
        raw_video_url = flashvars['video_url']
        if raw_video_url.startswith('function'):
            # The page javascript did not rewrite the URL; at least try the
            # static URL embedded after the prefix.
            self.report_warning('Page JS failed, fetch will likely fail')
        video_url = self._clean_video_url(raw_video_url)

        # The simplest way to get the real internal ID is to get it from the
        # URL we will be accessing.
        video_id = video_url.split('/')[-2]

        # The thumbnail is optional metadata. It usually does not have a
        # protocol on the front, e.g. "//media.thisvid.com"
        thumbnail_url = flashvars.get('preview_url')
        if thumbnail_url and thumbnail_url.startswith('//'):
            thumbnail_url = 'https:' + thumbnail_url

        # Parse the title information.
        title = self._search_regex(
            r'<title>(?P<title>.+) -([a-zA-Z ]+ at)? ThisVid(\.com| tube)</title>',
            webpage, 'title', group='title')

        # Parse the author information from a profile link. This is optional
        # metadata, so a missing link must not abort the extraction.
        author_re = r'<a.*class="author" href="(?P<url>[^"]+)"[^>]*>(?P<id>[^<]+)</a'
        uploader_id = self._search_regex(
            author_re, webpage, 'uploader id', group='id', fatal=False)
        uploader_url = self._search_regex(
            author_re, webpage, 'uploader url', group='url', default=None)
        if uploader_url:
            uploader_url = uploader_url.strip('/')

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'url': video_url,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
            'thumbnail': thumbnail_url,
            'age_limit': 18,
        }
class ThisVidEmbeddedIE(ThisVidIE):
    """Extractor for ``thisvid.com/embed/<id>`` player pages.

    The embed page is only used to discover the canonical video page URL;
    the actual extraction is delegated to whichever extractor handles that
    URL.
    """

    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/embed/(?P<id>[A-Za-z0-9-]+)'
    _TEST = {
        'url': 'https://thisvid.com/embed/854312',
        'md5': '8166497c0281b54a48b179c997463892',
        'info_dict': {
            'id': '854312',
            'display_id': 'soles-of-jaxwheeler',
            'ext': 'mp4',
            'title': 'Soles of JaxWheeler',
            'thumbnail': r're:^https?://.*preview\.mp4\.jpg$',
            'uploader_id': 'SNK13',
            'uploader_url': 'https://thisvid.com/members/252887',
            'age_limit': 18,
        },
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        embedded_page = self._download_webpage(url, video_id)
        # The third argument is the human-readable field name used in error
        # messages, not an identifier.
        full_url = self._search_regex(
            r'<link rel="canonical" href="(?P<href>[^"]+)"',
            embedded_page, 'canonical URL', group='href')
        # Pass the id from the embed URL as the hint rather than the slug at
        # the end of the canonical URL; judging by the test data above, the
        # embed path component is the final video id.
        return self.url_result(full_url, video_id=video_id)