2014-03-23 06:05:31 +08:00
from __future__ import unicode_literals
2013-06-24 02:07:51 +08:00
import re
from . common import InfoExtractor
2014-12-13 19:24:42 +08:00
from . . compat import (
2013-06-24 02:07:51 +08:00
compat_parse_qs ,
2015-07-18 01:41:47 +08:00
compat_urllib_parse_unquote ,
2014-12-13 19:24:42 +08:00
)
from . . utils import (
2013-07-17 07:14:30 +08:00
determine_ext ,
2013-06-24 02:07:51 +08:00
ExtractorError ,
2014-08-22 07:36:07 +08:00
int_or_none ,
2015-11-22 00:18:17 +08:00
sanitized_Request ,
2016-03-26 04:19:24 +08:00
urlencode_postdata ,
2013-06-24 02:07:51 +08:00
)
2014-03-23 06:05:31 +08:00
class MetacafeIE(InfoExtractor):
    """Extractor for ``metacafe.com/watch/<id>/<slug>/`` pages.

    Videos whose id carries the ``yt-`` or ``cb-`` prefix are delegated to the
    Youtube and ThePlatform extractors respectively.  For everything else the
    watch page is downloaded and the media location is looked up with several
    strategies, tried in order (see ``_real_extract``).
    """

    # Group 1 is the video id, group 2 the title slug.
    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = 'metacafe'
    _TESTS = [
        # Youtube video
        {
            'add_ie': ['Youtube'],
            'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
            'info_dict': {
                'id': '_aUehQsCQtM',
                'ext': 'mp4',
                'upload_date': '20090102',
                'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
                'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
                'uploader': 'PBS',
                'uploader_id': 'PBS',
            },
        },
        # Normal metacafe video
        {
            'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
            'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
            'info_dict': {
                'id': '11121940',
                'ext': 'mp4',
                'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
                'uploader': 'ign',
                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
            },
        },
        # AnyClip video
        {
            'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
            'info_dict': {
                'id': 'an-dVVXnuY7Jh77J',
                'ext': 'mp4',
                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
                'uploader': 'anyclip',
                'description': 'md5:38c711dd98f5bb87acf973d573442e67',
            },
        },
        # age-restricted video
        {
            'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
            'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
            'info_dict': {
                'id': '5186653',
                'ext': 'mp4',
                'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
                'uploader': 'Dwayne Pipe',
                'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
                'age_limit': 18,
            },
        },
        # cbs video
        {
            'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
            'info_dict': {
                'id': '8VD4r_Zws8VP',
                'ext': 'flv',
                'title': 'Open: This is Face the Nation, February 9',
                'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
                'duration': 96,
            },
            'params': {
                # rtmp download
                'skip_download': True,
            },
        },
        # Movieclips.com video
        {
            'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
            'info_dict': {
                'id': 'mv-Wy7ZU',
                'ext': 'mp4',
                'title': 'My Week with Marilyn - Do You Love Me?',
                'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
                'uploader': 'movie_trailers',
                'duration': 176,
            },
            'params': {
                'skip_download': 'requires rtmpdump',
            },
        },
    ]

    def report_disclaimer(self):
        """Print a status line announcing the disclaimer download."""
        self.to_screen('Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer, then POST the age confirmation.

        Both requests are issued with ``video_id=None``; the third positional
        argument (``False``) and the fourth (the error text) are passed
        through to ``_download_webpage`` unchanged.
        NOTE(review): presumably this switches the family filter off for the
        session so that restricted watch pages are served - confirm.
        """
        # Retrieve disclaimer
        self.report_disclaimer()
        self._download_webpage(
            self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = sanitized_Request(
            self._FILTER_POST, urlencode_postdata(disclaimer_form))
        request.add_header(
            'Content-Type', 'application/x-www-form-urlencoded')
        self.report_age_confirmation()
        self._download_webpage(request, None, False, 'Unable to confirm age')

    def _find_param_media_url(self, webpage):
        """Look for an ``&mediaURL=``/``&videoURL=`` parameter in *webpage*.

        Returns a ``(url, ext)`` tuple, or ``(None, None)`` when the parameter
        is absent.  When a ``gdaKey`` parameter is also present it is appended
        to the URL as ``__gda__``; the extension is taken from the URL before
        the key is appended.
        """
        mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage)
        if mobj is None:
            return None, None
        media_url = compat_urllib_parse_unquote(mobj.group(1))
        video_ext = determine_ext(media_url)
        # Extract gdaKey if available
        gda_mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        if gda_mobj is not None:
            media_url = '%s?__gda__=%s' % (media_url, gda_mobj.group(1))
        return media_url, video_ext

    def _find_html5_video_url(self, webpage):
        """Look for a ``<video src="...">`` tag in *webpage*.

        Returns ``(url, 'mp4')``, or ``(None, None)`` when there is no such
        tag.  The extension is hard-coded, not derived from the URL.
        """
        mobj = re.search(r'<video src="([^"]+)"', webpage)
        if mobj is None:
            return None, None
        return mobj.group(1), 'mp4'

    def _find_flashvars_media_url(self, webpage):
        """Look for the media URL inside the ``flashvars`` form value.

        Returns ``(url, ext)``, or ``(None, None)`` when the page has no
        ``flashvars`` value at all.

        Raises:
            ExtractorError: ``flashvars`` is present but carries no
                ``mediaData`` entry, or that entry lacks the
                ``mediaURL``/``key`` pair.
        """
        flashvars = self._search_regex(
            r'name="flashvars" value="(.*?)"', webpage, 'flashvars',
            default=None)
        if not flashvars:
            return None, None
        vardict = compat_parse_qs(flashvars)
        if 'mediaData' not in vardict:
            raise ExtractorError('Unable to extract media URL')
        mobj = re.search(
            r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"',
            vardict['mediaData'][0])
        if mobj is None:
            raise ExtractorError('Unable to extract media URL')
        # Slashes are backslash-escaped inside mediaData; undo that.
        media_url = mobj.group('mediaURL').replace('\\/', '/')
        video_url = '%s?__gda__=%s' % (media_url, mobj.group('key'))
        return video_url, determine_ext(video_url)

    def _build_player_smil_formats(self, webpage, video_id, url):
        """Build a formats list from the SMIL file referenced by the SWF player.

        The player URL passed to ``swfobject.embedSWF`` carries a ``config=``
        URL; that XML config names a SMIL file, whose ``<video>`` nodes each
        become one format entry (``url`` is the SMIL base URL, ``play_path``
        the node's ``src``).

        Returns the list of format dicts (possibly empty), or ``None`` when
        the page embeds no SWF player.

        Raises:
            ExtractorError: the config or SMIL document lacks the expected
                ``properties``/``head/meta`` element or attribute.
        """
        player_url = self._search_regex(
            r"swfobject\.embedSWF\('([^']+)'",
            webpage, 'config URL', default=None)
        if not player_url:
            return None

        config_url = self._search_regex(
            r'config=(.+)$', player_url, 'config URL')
        config_doc = self._download_xml(
            config_url, video_id,
            note='Downloading video config')
        # find() returns None for a missing node; report that as an
        # extraction failure instead of leaking an AttributeError.
        properties = config_doc.find('.//properties')
        if properties is None or 'smil_file' not in properties.attrib:
            raise ExtractorError('Unable to extract SMIL URL')
        smil_url = properties.attrib['smil_file']

        smil_doc = self._download_xml(
            smil_url, video_id,
            note='Downloading SMIL document')
        meta = smil_doc.find('./head/meta')
        if meta is None or 'base' not in meta.attrib:
            raise ExtractorError('Unable to extract SMIL base URL')
        base_url = meta.attrib['base']

        formats = []
        for vn in smil_doc.findall('.//video'):
            br = int(vn.attrib['system-bitrate'])
            play_path = vn.attrib['src']
            formats.append({
                'format_id': 'smil-%d' % br,
                'url': base_url,
                'play_path': play_path,
                'page_url': url,
                'player_url': player_url,
                # play_path looks like "<ext>:<path>"; keep the part before
                # the first colon as the extension.
                'ext': play_path.partition(':')[0],
            })
        return formats

    def _real_extract(self, url):
        """Extract the info dict (or a delegating url_result) for *url*.

        Media lookup order on the watch page: ``mediaURL``/``videoURL``
        parameter, ``<video src>`` tag, ``flashvars`` ``mediaData``, and
        finally the SWF player's SMIL file.

        Raises:
            ExtractorError: *url* does not match ``_VALID_URL``, or none of
                the lookup strategies applies to the page.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # the video may come from an external site
        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
        if m_external is not None:
            prefix, ext_id = m_external.groups()
            # Check if video comes from YouTube
            if prefix == 'yt':
                return self.url_result(
                    'http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
            # CBS videos use theplatform.com
            if prefix == 'cb':
                return self.url_result(
                    'theplatform:%s' % ext_id, 'ThePlatform')

        # Retrieve video webpage to extract further information
        req = sanitized_Request(
            'http://www.metacafe.com/watch/%s/' % video_id)
        # AnyClip videos require the flashversion cookie so that we get the
        # link to the mp4 file
        if video_id.startswith('an-'):
            req.headers['Cookie'] = 'flashVersion=0;'
        webpage = self._download_webpage(req, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)

        video_url, video_ext = self._find_param_media_url(webpage)
        if video_url is None:
            video_url, video_ext = self._find_html5_video_url(webpage)
        if video_url is None:
            video_url, video_ext = self._find_flashvars_media_url(webpage)

        if video_url is not None:
            formats = [{
                'url': video_url,
                'ext': video_ext,
            }]
        else:
            formats = self._build_player_smil_formats(
                webpage, video_id, url)
        if formats is None:
            raise ExtractorError('Unsupported video type')

        video_title = self._html_search_regex(
            r'(?im)<title>(.*) - Video</title>', webpage, 'title')
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)
        video_uploader = self._html_search_regex(
            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
            webpage, 'uploader nickname', fatal=False)
        duration = int_or_none(
            self._html_search_meta('video:duration', webpage))
        # 18 when the page marks the content as restricted, 0 otherwise.
        age_limit = (
            18
            if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
            else 0)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'description': description,
            'uploader': video_uploader,
            'title': video_title,
            'thumbnail': thumbnail,
            'age_limit': age_limit,
            'formats': formats,
            'duration': duration,
        }