mirror of
https://github.com/l1ving/youtube-dl
synced 2025-03-13 22:07:17 +08:00
337 lines
14 KiB
Python
337 lines
14 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import re
|
|
|
|
from .common import InfoExtractor
|
|
from ..utils import (
|
|
clean_html,
|
|
js_to_json,
|
|
ExtractorError,
|
|
compat_parse_qs,
|
|
compat_urllib_parse_urlparse,
|
|
compat_urllib_parse,
|
|
compat_urllib_request
|
|
)
|
|
|
|
|
|
class RoosterteethShowIE(InfoExtractor):
    """Playlist extractor for ``/show/<id>`` pages of the Rooster Teeth sites.

    An optional ``#;<query string>`` suffix on the URL carries episode filter
    rules (for example ``#;season=.* 1$``); see ``_match_filter`` for how the
    rules are evaluated.
    """

    _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/show/(?P<id>[^/]+)(?:/season)?'
    _TESTS = [{
        'url': 'http://roosterteeth.com/show/screen-play',
        'info_dict': {
            'id': 'screen-play',
            'description': 'A Rooster Teeth podcast focusing on all things Film and TV. Listen to our pop culture geeks chat about TV premieres and finales, blockbuster franchises, indie darlings, casting rumors and spotlight a film to discuss in their weekly "Movie Book Club" segment. So pop some popcorn, grab a good seat and enjoy the show.',
            'title': 'Screen Play',
        },
        'playlist_count': 23
    }, {
        'url': 'http://roosterteeth.com/show/red-vs-blue#;season=.* 1$',
        'info_dict': {
            'id': 'red-vs-blue',
            'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.',
            'title': 'Red vs. Blue',
        },
        'playlist_count': 24
    }, {
        'url': 'http://roosterteeth.com/show/red-vs-blue',
        'info_dict': {
            'id': 'red-vs-blue',
            'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.',
            'title': 'Red vs. Blue',
        },

        'playlist_mincount': 380
    }]

    def _real_extract(self, url):
        """Scrape a show page and return a playlist of its episodes.

        The page is parsed with plain string searches and regular
        expressions.  Shows either list their episodes in a single ``<ul>``
        or group them into titled sections (seasons).  Each episode becomes
        a ``url_result`` handled by the ``Roosterteeth`` extractor; for
        sectioned shows the season title is attached to the entry and also
        appended to the episode URL after ``#;``.

        Raises ExtractorError when the expected markup cannot be found.
        """
        ep_filter = {}

        if '#;' in url:
            # Split only on the first marker so that a filter value which
            # itself contains '#;' does not break the two-way unpacking.
            url, params = url.split('#;', 1)
            ep_filter = compat_parse_qs(params)

        playlist_id = self._match_id(url)
        html = self._download_webpage(url, playlist_id)

        title = self._html_search_regex(r'<div class="show-header">\s*<h1>([^<]+)</h1>\s*</div>', html, 'show title')
        description = self._html_search_regex(r'<section class="show-details">((?:[^<]|<(?!/section>))+)</section>', html, 'show description')

        start_piece = "<div id='tab-content-episodes' class='tab-content'>"
        start = html.find(start_piece)
        if start == -1:
            raise ExtractorError("Can't find the episodes!")

        # Keep only what follows the episodes tab container.
        html = html[start + len(start_piece):].lstrip()
        sections = []
        if html.startswith('<ul class='):
            # This show doesn't have seasons AKA sections.
            end = html.find('</ul>')
            if end == -1:
                raise ExtractorError("Can't find the end of the episode list!")

            sections = [(None, html[:end])]
        else:
            # We have to extract the sections.
            end = html.find('</article></section></section>')
            if end == -1:
                raise ExtractorError("Can't find the end of the section list!")

            html = html[:end]
            HEADER_RE = re.compile(r"<h3 class='title' id='header-[^']+'>([^<]+)</h3>")

            # Process sections / seasons
            for section in html.split('</section>'):
                sec_title = self._html_search_regex(HEADER_RE, section, 'season title')
                start = section.find("<ul class='episode-blocks'>")
                end = section.find("</ul>", start)

                if start < 0 or end < 0:
                    raise ExtractorError("Couldn't parse season %s! (%s)" % (sec_title, playlist_id))

                sections.append((sec_title, section[start:end]))

        results = []
        EP_RE = re.compile(r'<a href="(?P<url>[^"]+)">(?:[^<]|<(?!p class="name"))+<p class="name">(?P<title>[^<]+)</p>\s*</a>')

        for sec_title, part in reversed(sections):
            for ep_part in part.split('</li>'):
                if ep_part.strip() == '':
                    continue

                ep = EP_RE.search(ep_part)
                if not ep:
                    raise ExtractorError("Failed to parse an episode of season %s! (%s, %s)" % (sec_title or '0', playlist_id, ep_part))

                ep_url = clean_html(ep.group('url'))
                if sec_title:
                    # Pass the season title to the video extractor.
                    ep_url += '#;' + compat_urllib_parse.urlencode({'season': sec_title})
                    res = self.url_result(ep_url, 'Roosterteeth')
                    res['season'] = sec_title
                else:
                    res = self.url_result(ep_url, 'Roosterteeth')

                if self._match_filter(res, ep_filter):
                    results.append(res)

        if len(sections) == 1 and sections[0][0] is None:
            # If the page didn't contain sections, then the episodes are in reverse order.
            results = list(reversed(results))

        return self.playlist_result(results, playlist_id, title, description)

    def _match_filter(self, item, filter_rules):
        """Return True if ``item`` satisfies every rule in ``filter_rules``.

        ``filter_rules`` maps a field name to a list of strings (the shape
        produced by ``compat_parse_qs``).  A list with more than one entry is
        treated as the set of acceptable exact values.  Otherwise the single
        entry is a regular expression applied with ``re.match`` to the
        item's value for that field.  An item that lacks the field never
        satisfies a rule for it.
        """
        for k, v in filter_rules.items():
            value = item.get(k)
            if isinstance(v, list) and len(v) > 1:
                # A list of acceptable values
                if value not in v:
                    return False
            else:
                # re.match() raises TypeError on None, so a missing field
                # has to be rejected explicitly.
                if value is None or not re.match(v[0], value):
                    return False

        return True
|
|
|
|
|
|
class RoosterteethIE(InfoExtractor):
    """Extractor for single ``/episode/<id>`` pages of the Rooster Teeth sites.

    Episodes are served either through a JW Player HLS manifest or as an
    embedded YouTube video.  Sponsor-only episodes trigger a login attempt
    with the configured credentials.
    """

    _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)'
    _TESTS = [
        {
            'params': {
                # Without this parameter ytdl downloads the whole file.
                'hls_prefer_native': True
            },

            'url': 'http://achievementhunter.com/episode/rage-quit-season-1-episode-199',
            'md5': '828fe30ccdddf5d85e444e33686d531a',
            'info_dict': {
                'id': 'rage-quit-season-1-episode-199',
                'ext': 'mp4',
                'title': 'Rage Quit - No Time to Explain',
                'description': 'There\'s no time to explain this video.',
                'thumbnail': r're:^http://s3\.amazonaws\.com/cdn\.roosterteeth\.com/uploads/images/[a-f0-9-]+/md/[a-z0-9-]+\.jpeg$',
                'protocol': 'm3u8',
                'url': r're:^http://[a-zA-Z0-9.]+\.taucdn\.net/[0-9a-zA-Z]+/video/uploads/videos/[0-9a-f-]+/[0-9A-Z]+\.m3u8$',
            }
        },
        {
            'url': 'http://roosterteeth.com/episode/red-vs-blue-season-1-episode-1',
            'md5': '80277833f3ed946b553d13cf8e27443d',
            'info_dict': {
                'id': 'red-vs-blue-season-1-episode-1',
                'ext': 'mp4',

                'title': 'Why Are We Here? - Episode 1 - Red vs. Blue Season 1',
                'thumbnail': r're:^https://i\.ytimg\.com/vi/[0-9a-zA-Z]+/maxresdefault\.jpg$',
                'url': r're:^https://[0-9a-z-]+\.googlevideo\.com/videoplayback',

                'upload_date': '20150306',
                'uploader_id': 'UCII0hP2Ycmhh5j8lS4cexBQ',
                'uploader': 'Red vs. Blue',
                'description': 'The first episode of Red vs. Blue introduces the main characters, and poses the all-important question, why are we here?'
            }
        }
    ]
    _NETRC_MACHINE = 'roosterteeth'
    # Per-domain login outcome (domain -> bool); replaced by a fresh dict in
    # _real_initialize.
    _authed = None
    # Cached result of the sponsor check; None until it has been determined.
    _sponsor = None

    def _real_initialize(self):
        """Reset the per-domain login cache."""
        self._authed = {}

    def _real_extract(self, url):
        """Extract one episode and return its info dict.

        An optional ``#;<query string>`` suffix on the URL is parsed; its
        ``season`` value, when present, is copied into the result.

        Raises ExtractorError when the episode is sponsor-only and cannot be
        accessed, or when the player metadata on the page cannot be parsed.
        """
        if '#;' in url:
            # Split only on the first marker so that a parameter value which
            # itself contains '#;' does not break the two-way unpacking.
            url, params = url.split('#;', 1)
            params = compat_parse_qs(params)
        else:
            params = {}

        video_id = self._match_id(url)
        html = self._download_webpage(url, video_id)

        if html.find('Unfortunately, this is sponsor-only.') > -1:
            domain = compat_urllib_parse_urlparse(url).netloc
            release_match = re.search(r'<p>[^<]+ Releases ([0-9]+ [a-zA-Z]+) from now</p>', html)
            if release_match:
                release = ' The video will be public in %s.' % release_match.group(1)
            else:
                release = ''

            if not self._login(domain):
                raise ExtractorError("This video is sponsor-only. You didn't provide your credentials or the login failed.%s" % release, expected=True)

            # Try again.
            html = self._download_webpage(url, video_id)
            if html.find('Unfortunately, this is sponsor-only.') > -1:
                if not self._is_sponsor(domain):
                    raise ExtractorError('This video is sponsor-only but you are not a sponsor.%s' % release, expected=True)
                else:
                    raise ExtractorError('This is a sponsor-only video and although I tried to login, it did not work.')

        js = self._html_search_regex(r'<script src="https?://roosterteeth\.com/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info')
        info = re.search(r'RT\.(?P<player>youtube|jwplayer)\.player\((?P<json>\{(?:[^}]|\}(?!\);))+\})\);', js)
        if not info:
            raise ExtractorError("Can't parse the video metadata! (%s)" % js)

        player = info.group('player')
        meta = self._parse_json(js_to_json(info.group('json')), video_id)
        if player == 'jwplayer':
            # Make sure that all values are there.
            for attr in ('containerId', 'videoImage', 'videoTitle', 'manifest'):
                if attr not in meta:
                    raise ExtractorError('Unexpected video info! Attribute %s is missing.' % attr)

            video_image = meta['videoImage']
            if video_image.startswith('//'):
                # Protocol-relative URL: make it absolute.
                video_image = 'http:' + video_image

            res = {
                'id': video_id,
                'title': meta['videoTitle'].strip(),
                'formats': self._extract_m3u8_formats(meta['manifest'], video_id, ext='mp4'),
                'thumbnail': video_image
            }
        elif player == 'youtube':
            if 'youtubeKey' not in meta:
                raise ExtractorError('Invalid metadata for youtube video!')

            # Delegate to the YouTube URL but keep this page's id and the
            # fields set below.
            res = self.url_result('https://youtube.com/watch?v=' + meta['youtubeKey'])
            res['_type'] = 'url_transparent'
            res['id'] = video_id
        else:
            raise ExtractorError('Unknown player type %s!' % player)

        if 'season' in params:
            res['season'] = params['season'][0]

        desc = self._og_search_description(html)
        if desc:
            res['description'] = desc.strip()

        return res

    def _login(self, domain='roosterteeth.com'):
        """
        Attempt to log in to RoosterTeeth (or Achievement Hunter).
        NOTE: RT is planning to implement SSO which will probably change how this works.

        Returns True when the session is (or already was) logged in on
        ``domain`` and False otherwise.  A definite outcome is cached per
        domain in ``self._authed``; missing credentials and download
        failures are not cached.
        """

        if domain in self._authed:
            return self._authed[domain]

        (username, password) = self._get_login_info()

        # No authentication to be performed
        if username is None:
            return False

        LOGIN_URL = 'http://%s/login' % domain
        response = self._download_webpage_handle(
            LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)

        # With fatal=False a failed download may be reported as a bare False
        # instead of a (page, handle) pair, so check before unpacking.
        if response is False:
            return False

        login_page, hdl = response
        if login_page is False:
            return False

        if hdl.geturl() != LOGIN_URL:
            # We were redirected which means that we're already logged in.
            self._authed[domain] = True
            return True

        token = self._search_regex(r'(?s)<input.+?name="_token".+?value="(.+?)"',
                                   login_page, 'Login token')

        # Log in
        login_form_strs = {
            '_token': token,
            'username': username,
            'password': password
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(LOGIN_URL, login_data, {'Content-Type': 'application/x-www-form-urlencoded'})
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)

        if login_results is False:
            return False

        if login_results.find('Error in exception handler.') > -1 or login_results.find('Authentication failed. Please check and try again, or reset your password') > -1:
            self.report_warning('unable to log in: bad username or password')
            self._authed[domain] = False
            return False

        self._authed[domain] = True
        return True

    def _is_sponsor(self, domain='roosterteeth.com'):
        """Return whether the configured user is shown as a sponsor.

        The user's profile page on ``domain`` is downloaded and its sidebar
        header is checked for the star icon.  A successful check is cached
        in ``self._sponsor``; if no username is configured or the profile
        cannot be fetched or parsed, False is returned without caching.
        """
        if self._sponsor is None:
            username, _ = self._get_login_info()
            if username is None:
                # Without a username there is no profile page to inspect.
                return False

            profile_page = 'http://%s/user/%s' % (domain, compat_urllib_parse.quote(username))
            html = self._download_webpage(
                profile_page, None,
                note='Checking user profile...',
                errnote='unable to access user profile', fatal=False)

            if not html:
                return False

            # Escape the URL so that '.', '%' and similar characters in it
            # are matched literally rather than as regex syntax.
            user_info = self._search_regex(
                r'<div class="sidebar-profile-header">\s*<p[^>]+>\s*<a href="%s">[^<]+</a>\s*<span>((?:[^<]|<(?!/span>))+)</span>' % re.escape(profile_page),
                html, 'user status', fatal=False)

            if not user_info:
                return False

            self._sponsor = '<i class="icon ion-star"></i>' in user_info

        return self._sponsor
|