2014-01-22 10:50:49 +08:00
from __future__ import unicode_literals
2013-06-24 02:50:22 +08:00
import re
2013-12-03 21:58:24 +08:00
from . mtv import MTVServicesInfoExtractor
2014-12-13 19:24:42 +08:00
from . . compat import (
2013-06-24 02:50:22 +08:00
compat_str ,
compat_urllib_parse ,
2014-12-13 19:24:42 +08:00
)
from . . utils import (
2013-06-24 02:50:22 +08:00
ExtractorError ,
2014-03-29 06:06:34 +08:00
float_or_none ,
2013-06-24 02:50:22 +08:00
unified_strdate ,
)
2013-12-03 21:58:24 +08:00
class ComedyCentralIE(MTVServicesInfoExtractor):
    """Extractor for clips and episodes hosted directly on cc.com.

    All extraction logic is inherited from MTVServicesInfoExtractor; this
    class only supplies the URL pattern, the MRSS feed endpoint and a
    download test.
    """

    # Verbose regex: the (?x) flag must be the very first thing in the
    # pattern and no whitespace may appear inside tokens such as (?P<...>).
    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
        (video-clips|episodes|cc-studios|video-collections|full-episodes)
        /(?P<title>.*)'''
    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'

    _TEST = {
        'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
        'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
        'info_dict': {
            'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
            'ext': 'mp4',
            'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother',
            'description': 'After a certain point, breastfeeding becomes c**kblocking.',
        },
    }
2014-11-15 01:34:44 +08:00
class ComedyCentralShowsIE(MTVServicesInfoExtractor):
    """Extractor for The Daily Show / The Colbert Report pages.

    A page is resolved to an MRSS feed; every <item> of that feed becomes
    one playlist entry ("part N") with an HTTP and an RTMP variant for
    each rendition listed in the item's media configuration.
    """

    IE_DESC = 'The Daily Show / The Colbert Report'

    # URLs can be abbreviations like :thedailyshow (or :tds),
    # URLs for episodes like: http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary
    # or URLs for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    #
    # NOTE: the optional "www." prefix is a non-capturing group (?:www\.)?;
    # only named groups are read from the match object.
    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
                      |https?://(?:www\.)?
                          (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
                         ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
                          (?P<clip>
                              (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
                              |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
                          )|
                          (?P<interview>
                              extended-interviews/(?P<interID>[0-9a-z]+)/
                              (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?)
                              (?:/[^/?#]?|[?#]|$))))
                     '''
    _TESTS = [{
        'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
        'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
        'info_dict': {
            'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55',
            'ext': 'mp4',
            'upload_date': '20121213',
            'description': 'Kristen Stewart learns to let loose in "On the Road."',
            'uploader': 'thedailyshow',
            'title': 'thedailyshow kristen-stewart part 1',
        }
    }, {
        'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview',
        'info_dict': {
            'id': 'sarah-chayes-extended-interview',
            'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."',
            'title': 'thedailyshow Sarah Chayes Extended Interview',
        },
        'playlist': [
            {
                'info_dict': {
                    'id': '0baad492-cbec-4ec1-9e50-ad91c291127f',
                    'ext': 'mp4',
                    'upload_date': '20150129',
                    'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."',
                    'uploader': 'thedailyshow',
                    'title': 'thedailyshow sarah-chayes-extended-interview part 1',
                },
            },
            {
                'info_dict': {
                    'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283',
                    'ext': 'mp4',
                    'upload_date': '20150129',
                    'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."',
                    'uploader': 'thedailyshow',
                    'title': 'thedailyshow sarah-chayes-extended-interview part 2',
                },
            },
        ],
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
        'only_matching': True,
    }, {
        'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
        'only_matching': True,
    }, {
        'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
        'only_matching': True,
    }, {
        'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
        'only_matching': True,
    }, {
        'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary',
        'only_matching': True,
    }, {
        'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall',
        'only_matching': True,
    }, {
        'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
        'only_matching': True,
    }, {
        'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo',
        'only_matching': True,
    }, {
        'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
        'only_matching': True,
    }, {
        'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',
        'only_matching': True,
    }]

    # Rendition bitrates, highest first. Not referenced by the code in this
    # class; kept because it is part of the class's attribute surface.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per rendition bitrate (lookups fall back to 'mp4').
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # (width, height) per rendition bitrate.
    _video_dimensions = {
        '3500': (1280, 720),
        '2200': (960, 540),
        '1700': (768, 432),
        '1200': (640, 360),
        '750': (512, 288),
        '400': (384, 216),
    }

    def _find_media_uri(self, webpage, url):
        """Return the media URI embedded in ``webpage``, normalised to cc.com.

        Raises ExtractorError when neither the player URL nor a
        ``data-mgid`` attribute is present; ``url`` is only used in that
        error message.
        """
        player = re.search(
            r'(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"',
            webpage)
        if player:
            uri = player.group(2)
        else:
            # Fallback: some pages carry the id in a data-mgid attribute
            # without the media.mtvnservices.com URL prefix.
            mgid = re.search(
                r'data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"',
                webpage)
            if not mgid:
                raise ExtractorError('unable to find Flash URL in webpage ' + url)
            uri = mgid.group(1)
        # Correct cc.com in uri
        return re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)

    def _build_segment_formats(self, cdoc):
        """Return the sorted format list for one segment's configuration XML.

        Each <rendition> yields two formats: the HTTP variant produced by
        ``_transform_rtmp_url`` and the RTMP stream itself.
        """
        formats = []
        for rendition in cdoc.findall('.//rendition'):
            bitrate = rendition.attrib['bitrate']
            rtmp_video_url = rendition.findall('./src')[0].text
            width, height = self._video_dimensions.get(bitrate, (None, None))
            ext = self._video_extensions.get(bitrate, 'mp4')
            formats.append({
                'format_id': 'vhttp-%s' % bitrate,
                'url': self._transform_rtmp_url(rtmp_video_url),
                'ext': ext,
                'height': height,
                'width': width,
            })
            formats.append({
                'format_id': 'rtmp-%s' % bitrate,
                'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'),
                'ext': ext,
                'height': height,
                'width': width,
            })
        self._sort_formats(formats)
        return formats

    def _real_extract(self, url):
        """Extract a playlist with one entry per feed item of the page.

        ``url`` is either a ":tds"/":thedailyshow" shortcut (resolved to
        the show's full-episodes page, which is expected to redirect to a
        concrete episode) or a show page URL matching ``_VALID_URL``.

        Raises ExtractorError when a redirect does not lead to a specific
        episode/video or when no media reference is found in the page.
        """
        mobj = re.match(self._VALID_URL, url)

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = 'http://thedailyshow.cc.com/full-episodes/'
            else:
                url = 'http://thecolbertreport.cc.com/full-episodes/'
            # _VALID_URL already carries (?x); no extra flag is needed.
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('videotitle'):
                ep_title = mobj.group('videotitle')
            elif mobj.group('showname') == 'thedailyshow':
                ep_title = mobj.group('tdstitle')
            else:
                ep_title = mobj.group('cntitle')
            download_newest = False
        elif mobj.group('interview'):
            ep_title = mobj.group('interview_title')
            download_newest = False
        else:
            # An episodes URL without an episode slug means "newest episode".
            download_newest = not mobj.group('episode')
            if download_newest:
                ep_title = mobj.group('showname')
            else:
                ep_title = mobj.group('episode')
        show_name = mobj.group('showname')

        webpage, html_handle = self._download_webpage_handle(url, ep_title)
        if download_newest:
            # Re-parse the URL we were redirected to in order to learn the
            # actual episode (or video) title.
            url = html_handle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                raise ExtractorError('Invalid redirected URL: ' + url)
            redirected_title = mobj.group('episode') or mobj.group('videotitle')
            if not redirected_title:
                # Covers an empty episode slug as well as redirects to a
                # page type that carries neither group.
                raise ExtractorError('Redirected URL is still not specific: ' + url)
            ep_title = redirected_title.rpartition('/')[-1]

        uri = self._find_media_uri(webpage, url)

        index_url = 'http://%s.cc.com/feeds/mrss?%s' % (
            show_name, compat_urllib_parse.urlencode({'uri': uri}))
        idoc = self._download_xml(
            index_url, ep_title,
            'Downloading show index', 'Unable to download episode index')

        title = idoc.find('./channel/title').text
        description = idoc.find('./channel/description').text

        entries = []
        item_els = idoc.findall('.//item')
        for part_num, item_el in enumerate(item_els):
            upload_date = unified_strdate(item_el.findall('./pubDate')[0].text)
            # The thumbnail is optional metadata: tolerate items without one.
            thumbnail_el = item_el.find('.//{http://search.yahoo.com/mrss/}thumbnail')
            thumbnail = thumbnail_el.attrib.get('url') if thumbnail_el is not None else None

            content = item_el.find('.//{http://search.yahoo.com/mrss/}content')
            duration = float_or_none(content.attrib.get('duration'))
            mediagen_url = content.attrib['url']
            guid = item_el.find('./guid').text.rpartition(':')[-1]

            cdoc = self._download_xml(
                mediagen_url, ep_title,
                'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els)))

            formats = self._build_segment_formats(cdoc)
            subtitles = self._extract_subtitles(cdoc, guid)

            virtual_id = show_name + ' ' + ep_title + ' part ' + compat_str(part_num + 1)
            entries.append({
                'id': guid,
                'title': virtual_id,
                'formats': formats,
                'uploader': show_name,
                'upload_date': upload_date,
                'duration': duration,
                'thumbnail': thumbnail,
                'description': description,
                'subtitles': subtitles,
            })

        return {
            '_type': 'playlist',
            'id': ep_title,
            'entries': entries,
            'title': show_name + ' ' + title,
            'description': description,
        }