2014-01-05 11:30:00 +08:00
from __future__ import unicode_literals
2013-06-26 23:55:54 +08:00
import re
from . common import InfoExtractor
from . . utils import (
2014-04-03 11:56:28 +08:00
int_or_none ,
2014-01-05 11:30:00 +08:00
unescapeHTML ,
2014-02-03 01:24:20 +08:00
find_xpath_attr ,
2015-04-21 03:18:38 +08:00
smuggle_url ,
2015-04-24 23:46:51 +08:00
determine_ext ,
2015-10-18 04:30:38 +08:00
ExtractorError ,
2013-06-26 23:55:54 +08:00
)
2015-04-21 03:18:38 +08:00
from . senateisvp import SenateISVPIE
2013-06-26 23:55:54 +08:00
2014-01-05 11:30:00 +08:00
2013-06-26 23:55:54 +08:00
class CSpanIE(InfoExtractor):
    """Extractor for video pages on c-span.org.

    The page is scanned for a clip or program id, which is then used to
    query the site's player service (formats, durations, captions) and its
    flash XML service (title, poster).  Pages that embed a Senate ISVP
    player instead are delegated to the ``SenateISVP`` extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
    IE_DESC = 'C-SPAN'
    _TESTS = [{
        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
        'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
        'info_dict': {
            'id': '315139',
            'ext': 'mp4',
            'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
            'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.',
        },
        'skip': 'Regularly fails on travis, for unknown reasons',
    }, {
        'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
        'md5': '8e5fbfabe6ad0f89f3012a7943c1287b',
        'info_dict': {
            'id': 'c4486943',
            'ext': 'mp4',
            'title': 'CSPAN - International Health Care Models',
            'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
        }
    }, {
        'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
        'md5': '2ae5051559169baadba13fc35345ae74',
        'info_dict': {
            'id': '342759',
            'ext': 'mp4',
            'title': 'General Motors Ignition Switch Recall',
            'duration': 14848,
            'description': 'md5:118081aedd24bf1d3b68b3803344e7f3'
        },
    }, {
        # Video from senate.gov
        'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
        'info_dict': {
            'id': 'judiciary031715',
            'ext': 'flv',
            'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
        }
    }]

    def _real_extract(self, url):
        """Return the info dict (or playlist) for the video page at *url*.

        A single-file video yields one info dict; a video split into
        several files yields a playlist with one entry per file.  In both
        cases the top-level id is prefixed with ``c`` for clips.

        Raises ExtractorError when no clip/program id can be located on the
        page, or when the player service reports a non-success status.
        """
        video_id = self._match_id(url)
        video_type = None
        webpage = self._download_webpage(url, video_id)

        # Try clipid before clipprog: clipprog always appears before clipid
        # in the page, so the patterns are tried in this fixed order to make
        # a clip page resolve to the clip rather than to its program.
        matches = None
        for kind in ('id', 'prog'):
            matches = re.search(
                r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % kind, webpage)
            if matches:
                break

        if matches:
            kind, video_id = matches.groups()
            video_type = 'clip' if kind == 'id' else 'program'
        else:
            # Alternative markup: data-clipid="..." / data-progid="..."
            m = re.search(
                r'data-(?P<type>clip|prog)id=["\'](?P<id>\d+)', webpage)
            if m:
                video_id = m.group('id')
                video_type = 'program' if m.group('type') == 'prog' else 'clip'
            else:
                # No native id on the page; hand over to the Senate ISVP
                # extractor if such a player is embedded, forcing our title.
                senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
                if senate_isvp_url:
                    title = self._og_search_title(webpage)
                    surl = smuggle_url(senate_isvp_url, {'force_title': title})
                    return self.url_result(surl, 'SenateISVP', video_id, title)
        if video_type is None or video_id is None:
            raise ExtractorError('unable to find video id and type')

        def get_text_attr(d, attr):
            # Values in the player JSON are wrapped as {'#text': value}.
            # `or {}` also covers a key that is present but null.
            return (d.get(attr) or {}).get('#text')

        data = self._download_json(
            'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
            video_id)['video']
        if data['@status'] != 'Success':
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')),
                expected=True)

        doc = self._download_xml(
            'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
            video_id)

        def get_xml_string(name):
            # find_xpath_attr returns None when no <string name="..."> node
            # matches; compare with `is not None` because an Element without
            # children is falsy.
            node = find_xpath_attr(doc, './/string', 'name', name)
            return node.text if node is not None else None

        description = self._html_search_meta('description', webpage)
        # The title is mandatory: fall back to the page's OpenGraph title
        # instead of crashing when the XML lacks one.
        title = get_xml_string('title') or self._og_search_title(webpage)
        # The thumbnail is optional and may legitimately be None.
        thumbnail = get_xml_string('poster')

        files = data['files']
        capfile = get_text_attr(data, 'capfile')

        entries = []
        for partnum, f in enumerate(files):
            formats = []
            for quality in f.get('qualities') or []:
                format_url = unescapeHTML(get_text_attr(quality, 'file'))
                if not format_url:
                    # A format without a URL is unusable; skipping it lets
                    # the `path` fallback below take over if nothing is left.
                    continue
                bitrate = get_text_attr(quality, 'bitrate')
                height = get_text_attr(quality, 'height')
                formats.append({
                    'format_id': '%s-%sp' % (bitrate, height),
                    'url': format_url,
                    'height': int_or_none(height),
                    'tbr': int_or_none(bitrate),
                })
            if not formats:
                # No per-quality files: use the single `path` of this part,
                # which is either an HLS manifest or a direct media URL.
                path = unescapeHTML(get_text_attr(f, 'path'))
                if not path:
                    continue
                if determine_ext(path) == 'm3u8':
                    formats = self._extract_m3u8_formats(
                        path, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls')
                else:
                    formats = [{'url': path}]
            self._sort_formats(formats)
            entries.append({
                'id': '%s_%d' % (video_id, partnum + 1),
                'title': (
                    title if len(files) == 1 else
                    '%s part %d' % (title, partnum + 1)),
                'formats': formats,
                'description': description,
                'thumbnail': thumbnail,
                'duration': int_or_none(get_text_attr(f, 'length')),
                'subtitles': {
                    'en': [{
                        'url': capfile,
                        'ext': determine_ext(capfile, 'dfxp')
                    }],
                } if capfile else None,
            })

        # Clips are identified as 'c<id>'; programs keep the bare id.
        final_id = 'c' + video_id if video_type == 'clip' else video_id

        if len(entries) == 1:
            entry = dict(entries[0])
            entry['id'] = final_id
            return entry
        return {
            '_type': 'playlist',
            'entries': entries,
            'title': title,
            'id': final_id,
        }