Mirror of https://github.com/l1ving/youtube-dl

[roosterteeth] added

ngld 2015-08-12 15:53:13 +02:00
parent 06c6efa970
commit 8b2e4e87e0
2 changed files with 339 additions and 0 deletions


@@ -498,6 +498,10 @@ from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .roosterteeth import (
    RoosterteethIE,
    RoosterteethShowIE
)
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE


@@ -0,0 +1,335 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    clean_html,
    js_to_json,
    ExtractorError,
    compat_urllib_parse_urlparse,
    compat_urllib_parse,
    compat_urllib_request
)
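
# Both extractors accept an optional '#;key=value' fragment appended to the URL.
# RoosterteethShowIE treats it as an episode filter (e.g. '#;season=.* 1$' keeps
# only episodes whose season title matches that regular expression) and appends
# '#;season=<title>' to each episode URL so the season name is handed on to
# RoosterteethIE.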


class RoosterteethShowIE(InfoExtractor):
    _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/show/(?P<id>[^/]+)(?:/season)?'
    _TESTS = [{
        'url': 'http://roosterteeth.com/show/screen-play',
        'info_dict': {
            'id': 'screen-play',
            'description': 'A Rooster Teeth podcast focusing on all things Film and TV. Listen to our pop culture geeks chat about TV premieres and finales, blockbuster franchises, indie darlings, casting rumors and spotlight a film to discuss in their weekly "Movie Book Club" segment. So pop some popcorn, grab a good seat and enjoy the show.',
            'title': 'Screen Play',
        },
        'playlist_count': 23
    }, {
        'url': 'http://roosterteeth.com/show/red-vs-blue#;season=.* 1$',
        'info_dict': {
            'id': 'red-vs-blue',
            'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.',
            'title': 'Red vs. Blue',
        },
        'playlist_count': 24
    }, {
        'url': 'http://roosterteeth.com/show/red-vs-blue',
        'info_dict': {
            'id': 'red-vs-blue',
            'description': 'In the distant future, two groups of soldiers battle for control of the least desirable piece of real estate in the known universe - a box canyon in the middle of nowhere.',
            'title': 'Red vs. Blue',
        },
        'playlist_mincount': 380
    }]
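
    # Scraping strategy: locate the 'episodes' tab on the show page, split it into
    # seasons at each </section> (shows without seasons expose a single flat
    # <ul class='episode-blocks'> instead), then turn every episode link into a
    # url_result handled by RoosterteethIE.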
    def _real_extract(self, url):
        ep_filter = {}
        if '#;' in url:
            url, params = url.split('#;')
            ep_filter = compat_urllib_parse.parse_qs(params)

        playlist_id = self._match_id(url)
        html = self._download_webpage(url, playlist_id)

        title = self._html_search_regex(r'<div class="show-header">\s*<h1>([^<]+)</h1>\s*</div>', html, 'show title')
        description = self._html_search_regex(r'<section class="show-details">((?:[^<]|<(?!/section>))+)</section>', html, 'show description')

        start_piece = "<div id='tab-content-episodes' class='tab-content'>"
        start = html.find(start_piece)
        if start == -1:
            raise ExtractorError("Can't find the episodes!")

        html = html[start + len(start_piece):].lstrip()
        sections = []

        if html.startswith('<ul class='):
            # This show doesn't have seasons AKA sections.
            end = html.find('</ul>')
            if end == -1:
                raise ExtractorError("Can't find the end of the episode list!")

            sections = [(None, html[:end])]
        else:
            # We have to extract the sections.
            end = html.find('</article></section></section>')
            if end == -1:
                raise ExtractorError("Can't find the end of the section list!")

            html = html[:end]
            HEADER_RE = re.compile(r"<h3 class='title' id='header-[^']+'>([^<]+)</h3>")

            # Process sections / seasons
            for section in html.split('</section>'):
                sec_title = self._html_search_regex(HEADER_RE, section, 'season title')

                start = section.find("<ul class='episode-blocks'>")
                end = section.find("</ul>", start)
                if start < 0 or end < 0:
                    raise ExtractorError("Couldn't parse season %s! (%s)" % (sec_title, playlist_id))

                sections.append((sec_title, section[start:end]))

        results = []
        EP_RE = re.compile(r'<a href="(?P<url>[^"]+)">(?:[^<]|<(?!p class="name"))+<p class="name">(?P<title>[^<]+)</p>\s*</a>')

        for sec_title, part in reversed(sections):
            episodes = part.split('</li>')

            for ep_part in episodes:
                if ep_part.strip() == '':
                    continue

                ep = EP_RE.search(ep_part)
                if not ep:
                    raise ExtractorError("Failed to parse an episode of season %s! (%s, %s)" % (sec_title or '0', playlist_id, ep_part))

                url = clean_html(ep.group('url'))
                if sec_title:
                    # Pass the season title to the video extractor.
                    url += '#;' + compat_urllib_parse.urlencode({'season': sec_title})
                    res = self.url_result(url, 'Roosterteeth')
                    res['season'] = sec_title
                else:
                    res = self.url_result(url, 'Roosterteeth')

                if self._match_filter(res, ep_filter):
                    results.append(res)

        if len(sections) == 1 and sections[0][0] is None:
            # If the page didn't contain sections, then the episodes are in reverse order.
            results = list(reversed(results))

        return self.playlist_result(results, playlist_id, title, description)

    def _match_filter(self, item, filter_rules):
        for k, v in filter_rules.items():
            if isinstance(v, list) and len(v) > 1:
                # A list of acceptable values
                if item.get(k) not in v:
                    return False
            else:
                if not re.match(v[0], item.get(k)):
                    return False

        return True
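

# RoosterteethIE handles two player types: episodes served through jwplayer (an
# m3u8 manifest fed to _extract_m3u8_formats) and episodes that are plain YouTube
# embeds (delegated via a url_transparent result). Sponsor-only episodes trigger a
# login attempt with the credentials from --username/--password or .netrc first.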
class RoosterteethIE(InfoExtractor):
    _VALID_URL = r'http://(?P<domain>(?:www\.)?(?:roosterteeth\.com|achievementhunter\.com|fun\.haus))/episode/(?P<id>[^/]+)'
    _TESTS = [
        {
            'params': {
                # Without this parameter ytdl downloads the whole file.
                'hls_prefer_native': True
            },
            'url': 'http://achievementhunter.com/episode/rage-quit-season-1-episode-199',
            'md5': '828fe30ccdddf5d85e444e33686d531a',
            'info_dict': {
                'id': 'rage-quit-season-1-episode-199',
                'ext': 'mp4',
                'title': 'Rage Quit - No Time to Explain',
                'description': 'There\'s no time to explain this video.',
                'thumbnail': r're:^http://s3\.amazonaws\.com/cdn\.roosterteeth\.com/uploads/images/[a-f0-9-]+/md/[a-z0-9-]+\.jpeg$',
                'protocol': 'm3u8',
                'url': r're:^http://[a-zA-Z0-9.]+\.taucdn\.net/[0-9a-zA-Z]+/video/uploads/videos/[0-9a-f-]+/[0-9A-Z]+\.m3u8$',
            }
        },
        {
            'url': 'http://roosterteeth.com/episode/red-vs-blue-season-1-episode-1',
            'md5': '80277833f3ed946b553d13cf8e27443d',
            'info_dict': {
                'id': 'red-vs-blue-season-1-episode-1',
                'ext': 'mp4',
                'title': 'Why Are We Here? - Episode 1 - Red vs. Blue Season 1',
                'thumbnail': r're:^https://i\.ytimg\.com/vi/[0-9a-zA-Z]+/maxresdefault\.jpg$',
                'url': r're:^https://[0-9a-z-]+\.googlevideo\.com/videoplayback',
                'upload_date': '20150306',
                'uploader_id': 'UCII0hP2Ycmhh5j8lS4cexBQ',
                'uploader': 'Red vs. Blue',
                'description': 'The first episode of Red vs. Blue introduces the main characters, and poses the all-important question, why are we here?'
            }
        }
    ]

    _NETRC_MACHINE = 'roosterteeth'
    _authed = None
    _sponsor = None

    def _real_initialize(self):
        self._authed = {}

    def _real_extract(self, url):
        if '#;' in url:
            url, params = url.split('#;')
            params = compat_urllib_parse.parse_qs(params)
        else:
            params = {}

        video_id = self._match_id(url)
        html = self._download_webpage(url, video_id)

        if html.find('Unfortunately, this is sponsor-only.') > -1:
            domain = compat_urllib_parse_urlparse(url).netloc
            release = re.search(r'<p>[^<]+ Releases ([0-9]+ [a-zA-Z]+) from now</p>', html)
            if release:
                release = ' The video will be public in %s.' % release.group(1)
            else:
                release = ''

            if not self._login(domain):
                raise ExtractorError("This video is sponsor-only. You didn't provide your credentials or the login failed.%s" % release, expected=True)

            # Try again.
            html = self._download_webpage(url, video_id)
            if html.find('Unfortunately, this is sponsor-only.') > -1:
                if not self._is_sponsor(domain):
                    raise ExtractorError('This video is sponsor-only but you are not a sponsor.%s' % release, expected=True)
                else:
                    raise ExtractorError('This is a sponsor-only video and although I tried to login, it did not work.')

        js = self._html_search_regex(r'<script src="https?://roosterteeth\.com/scripts/lib/(?:jwplayer|youtube)\.min\.js"></script>\s*<script>\s*([^<]+)\s*</script>', html, 'video info')
        info = re.search(r'RT\.(?P<player>youtube|jwplayer)\.player\((?P<json>\{(?:[^}]|\}(?!\);))+\})\);', js)
        if not info:
            raise ExtractorError("Can't parse the video metadata! (%s)" % js)

        player = info.group('player')
        meta = self._parse_json(js_to_json(info.group('json')), video_id)

        if player == 'jwplayer':
            # Make sure that all values are there.
            for attr in ('containerId', 'videoImage', 'videoTitle', 'manifest'):
                if attr not in meta:
                    raise ExtractorError('Unexpected video info! Attribute %s is missing.' % attr)

            video_image = meta['videoImage']
            if video_image.startswith('//'):
                video_image = 'http:' + video_image

            res = {
                'id': video_id,
                'title': meta['videoTitle'].strip(),
                'formats': self._extract_m3u8_formats(meta['manifest'], video_id, ext='mp4'),
                'thumbnail': video_image
            }
        elif player == 'youtube':
            if 'youtubeKey' not in meta:
                raise ExtractorError('Invalid metadata for youtube video!')

            res = self.url_result('https://youtube.com/watch?v=' + meta['youtubeKey'])
            res['_type'] = 'url_transparent'
            res['id'] = video_id
        else:
            raise ExtractorError('Unknown player type %s!' % player)

        if 'season' in params:
            res['season'] = params['season'][0]

        desc = self._og_search_description(html)
        if desc:
            res['description'] = desc.strip()

        return res

    def _login(self, domain='roosterteeth.com'):
        """
        Attempt to log in to RoosterTeeth (or Achievement Hunter).

        NOTE: RT is planning to implement SSO which will probably change how this works.
        """
        if domain in self._authed:
            return self._authed[domain]

        (username, password) = self._get_login_info()

        # No authentication to be performed
        if username is None:
            return False

        LOGIN_URL = 'http://%s/login' % domain
        login_page, hdl = self._download_webpage_handle(
            LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)

        if login_page is False:
            return False

        if hdl.geturl() != LOGIN_URL:
            # We were redirected which means that we're already logged in.
            self._authed[domain] = True
            return True

        token = self._search_regex(r'(?s)<input.+?name="_token".+?value="(.+?)"',
                                   login_page, 'Login token')

        # Log in
        login_form_strs = {
            '_token': token,
            'username': username,
            'password': password
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(LOGIN_URL, login_data, {'Content-Type': 'application/x-www-form-urlencoded'})
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)

        if login_results is False:
            return False

        if login_results.find('Error in exception handler.') > -1 or login_results.find('Authentication failed. Please check and try again, or reset your password') > -1:
            self.report_warning('unable to log in: bad username or password')
            self._authed[domain] = False
            return False

        self._authed[domain] = True
        return True

    def _is_sponsor(self, domain='roosterteeth.com'):
        if self._sponsor is None:
            username, _ = self._get_login_info()
            profile_page = 'http://%s/user/%s' % (domain, compat_urllib_parse.quote(username))

            html = self._download_webpage(
                profile_page, None,
                note='Checking user profile...',
                errnote='unable to access user profile', fatal=False)

            if not html:
                return False

            user_info = self._search_regex(
                r'<div class="sidebar-profile-header">\s*<p[^>]+>\s*<a href="%s">[^<]+</a>\s*<span>((?:[^<]|<(?!/span>))+)</span>' % (profile_page),
                html, 'user status', fatal=False)

            if not user_info:
                return False

            self._sponsor = '<i class="icon ion-star"></i>' in user_info

        return self._sponsor
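
For reference, a minimal sketch of how the new extractor could be driven through youtube-dl's Python API once this patch is applied; the episode URL and the hls_prefer_native option are taken from the test cases above, everything else is illustrative:

import youtube_dl

# 'hls_prefer_native' mirrors the test params above: let youtube-dl's native HLS
# downloader fetch the m3u8 segments instead of handing the stream to ffmpeg.
ydl_opts = {'hls_prefer_native': True}

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    # Episode URL from the RoosterteethIE test case; show URLs with an optional
    # '#;season=...' filter are handled the same way via RoosterteethShowIE.
    ydl.download(['http://roosterteeth.com/episode/red-vs-blue-season-1-episode-1'])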