mirror of
https://github.com/l1ving/youtube-dl
synced 2025-03-10 06:47:14 +08:00
[thisvid] Add extractor
This commit is contained in:
parent
695720ebe8
commit
23c4fb5e3e
@ -1162,6 +1162,10 @@ from .theweatherchannel import TheWeatherChannelIE
|
|||||||
from .thisamericanlife import ThisAmericanLifeIE
|
from .thisamericanlife import ThisAmericanLifeIE
|
||||||
from .thisav import ThisAVIE
|
from .thisav import ThisAVIE
|
||||||
from .thisoldhouse import ThisOldHouseIE
|
from .thisoldhouse import ThisOldHouseIE
|
||||||
|
from .thisvid import (
|
||||||
|
ThisVidIE,
|
||||||
|
ThisVidEmbeddedIE
|
||||||
|
)
|
||||||
from .threeqsdn import ThreeQSDNIE
|
from .threeqsdn import ThreeQSDNIE
|
||||||
from .tiktok import (
|
from .tiktok import (
|
||||||
TikTokIE,
|
TikTokIE,
|
||||||
|
132
youtube_dl/extractor/thisvid.py
Normal file
132
youtube_dl/extractor/thisvid.py
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from .openload import PhantomJSwrapper
|
||||||
|
|
||||||
|
|
||||||
|
class ThisVidIE(InfoExtractor):
|
||||||
|
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/videos/(?P<id>[A-Za-z0-9-]+)'
|
||||||
|
|
||||||
|
_TEST = {
|
||||||
|
'url': 'https://thisvid.com/videos/madonna-show-in-sexy-underwear/',
|
||||||
|
'md5': '48e38730d38394c6e9f1cce66fb04c6e',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '829503',
|
||||||
|
'display_id': 'madonna-show-in-sexy-underwear',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Madonna show in sexy underwear',
|
||||||
|
'thumbnail': r're:^https?://.*preview\.mp4\.jpg$',
|
||||||
|
'uploader_id': 'Mike_Hunt',
|
||||||
|
'uploader_url': 'https://thisvid.com/members/584768',
|
||||||
|
'age_limit': 18,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
display_id = self._match_id(url)
|
||||||
|
webpage = self._download_webpage(url, display_id)
|
||||||
|
|
||||||
|
# The webpage contains a raw piece of javascript which creates a
|
||||||
|
# variable called flashvars used by the player to update the webpage.
|
||||||
|
#
|
||||||
|
# Running the javascript fixes the URL in the static HTML which is
|
||||||
|
# broken, and updates the flashvars variable with the new info.
|
||||||
|
#
|
||||||
|
# Because there are a ton of errors from PhantomJS that do not affect
|
||||||
|
# the output, they have to be split from the actual JSON.
|
||||||
|
jscode = """
|
||||||
|
function checkFlashVars() {
|
||||||
|
flashvars = page.evaluate(function() {
|
||||||
|
return JSON.stringify(flashvars)
|
||||||
|
});
|
||||||
|
console.log('---'); // ensure any errors appear above where we are
|
||||||
|
console.log(flashvars);
|
||||||
|
saveAndExit();
|
||||||
|
}
|
||||||
|
checkFlashVars();"""
|
||||||
|
phantom = PhantomJSwrapper(self, required_version='2.0')
|
||||||
|
webpage, output = phantom.get(url, html=webpage, jscode=jscode)
|
||||||
|
flashvars = self._parse_json(output.split("---", 2)[1], display_id)
|
||||||
|
|
||||||
|
# Get the video URL from the flashvars.
|
||||||
|
video_url = flashvars['video_url']
|
||||||
|
|
||||||
|
# The value in the static HTML starts with "function/0/http://..."
|
||||||
|
# where the zero is sometimes another number.
|
||||||
|
#
|
||||||
|
# At try that static URL if there was a static update failure.
|
||||||
|
if video_url.startswith('function'):
|
||||||
|
self.report_warning('Page JS failed, fetch will likely fail')
|
||||||
|
video_url = video_url.split("/", 3)[2]
|
||||||
|
|
||||||
|
# Sometimes the video url ends with ".mp4",
|
||||||
|
# other times it ends with ".mp4/",
|
||||||
|
# yet other times it ends with ".mp4/?".
|
||||||
|
#
|
||||||
|
# All of it needs to be cleaned up.
|
||||||
|
video_url = video_url.split("?", 2)[0].strip("/")
|
||||||
|
|
||||||
|
# Get the thumbnail URL from the flashvars.
|
||||||
|
thumbnail_url = flashvars['preview_url']
|
||||||
|
|
||||||
|
# The thumbnail usually does not have a protocol on the front, e.g.
|
||||||
|
# "//media.thisvid.com"
|
||||||
|
if thumbnail_url.startswith("//"):
|
||||||
|
thumbnail_url = 'https:' + thumbnail_url
|
||||||
|
|
||||||
|
# The simplest way to get the real internal ID is to get it from the
|
||||||
|
# URL we will be accessing.
|
||||||
|
video_id = video_url.split("/")[-2]
|
||||||
|
|
||||||
|
# Parse the title information.
|
||||||
|
title = self._search_regex(r'<title>(?P<title>.+) -([a-zA-Z ]+ at)? ThisVid(\.com| tube)</title>',
|
||||||
|
webpage, display_id, group='title')
|
||||||
|
|
||||||
|
# Parse the author information from a profile link.
|
||||||
|
author_re = r'<a.*class="author" href="(?P<url>[^"]+)"[^>]*>(?P<id>[^<]+)</a'
|
||||||
|
uploader_id = self._search_regex(author_re, webpage, display_id,
|
||||||
|
group='id')
|
||||||
|
uploader_url = self._search_regex(author_re, webpage, display_id,
|
||||||
|
group='url').strip("/")
|
||||||
|
|
||||||
|
return {
|
||||||
|
'id': video_id,
|
||||||
|
'display_id': display_id,
|
||||||
|
'title': title,
|
||||||
|
'url': video_url,
|
||||||
|
'uploader_id': uploader_id,
|
||||||
|
'uploader_url': uploader_url,
|
||||||
|
'thumbnail': thumbnail_url,
|
||||||
|
'age_limit': 18,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ThisVidEmbeddedIE(ThisVidIE):
|
||||||
|
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/embed/(?P<id>[A-Za-z0-9-]+)'
|
||||||
|
_TEST = {
|
||||||
|
'url': 'https://thisvid.com/embed/854312',
|
||||||
|
'md5': '8166497c0281b54a48b179c997463892',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '854312',
|
||||||
|
'display_id': 'soles-of-jaxwheeler',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Soles of JaxWheeler',
|
||||||
|
'thumbnail': r're:^https?://.*preview\.mp4\.jpg$',
|
||||||
|
'uploader_id': 'SNK13',
|
||||||
|
'uploader_url': 'https://thisvid.com/members/252887',
|
||||||
|
'age_limit': 18,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def _real_extract(self, url):
|
||||||
|
video_id = self._match_id(url)
|
||||||
|
embedded_page = self._download_webpage(url, video_id)
|
||||||
|
full_url = self._search_regex(
|
||||||
|
r'<link rel="canonical" href="(?P<href>[^"]+)"',
|
||||||
|
embedded_page, video_id, group='href')
|
||||||
|
video_id = full_url.strip("/").rsplit("/", 1)[-1]
|
||||||
|
return self.url_result(full_url, video_id=video_id)
|
Loading…
x
Reference in New Issue
Block a user