2014-01-07 17:04:48 +08:00
from __future__ import unicode_literals
2013-08-27 17:56:48 +08:00
import re
from . common import InfoExtractor
2013-12-26 20:49:44 +08:00
from . . utils import (
int_or_none ,
parse_duration ,
2014-02-11 21:38:17 +08:00
url_basename ,
2013-12-26 20:49:44 +08:00
)
2013-08-27 17:56:48 +08:00
2013-08-28 06:14:19 +08:00
2013-08-27 17:56:48 +08:00
class CNNIE(InfoExtractor):
    """Extractor for videos served from cnn.com, edition.cnn.com and money.cnn.com.

    The video path taken from the URL is substituted into a per-sub-domain
    XML metadata URL (see ``_CONFIG``); formats, thumbnails and descriptive
    fields are then read from that XML document.
    """

    _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''

    _TESTS = [{
        'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
        'md5': '3e6121ea48df7e2259fe73a0628605c4',
        'info_dict': {
            'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
            'ext': 'mp4',
            'title': 'Nadal wins 8th French Open title',
            'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
            'duration': 135,
            'upload_date': '20130609',
        },
    }, {
        'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
        'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
        'info_dict': {
            'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
            'ext': 'mp4',
            'title': "Student's epic speech stuns new freshmen",
            'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
            'upload_date': '20130821',
        }
    }, {
        'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
        'md5': 'f14d02ebd264df951feb2400e2c25a1b',
        'info_dict': {
            'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
            'ext': 'mp4',
            'title': 'Nashville Ep. 1: Hand crafted skateboards',
            'description': 'md5:e7223a503315c9f150acac52e76de086',
            'upload_date': '20141222',
        }
    }, {
        'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
        'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
        'info_dict': {
            'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
            'ext': 'mp4',
            'title': '5 stunning stats about Netflix',
            'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
            'upload_date': '20160819',
        }
    }, {
        'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
        'only_matching': True,
    }, {
        'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
        'only_matching': True,
    }, {
        'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
        'only_matching': True,
    }]

    # Per-sub-domain endpoints: 'data_src' is the XML metadata URL template
    # (filled with the video path), 'media_src' is prefixed to each file path.
    _CONFIG = {
        # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
        'edition': {
            'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
            'media_src': 'http://pmd.cdn.turner.com/cnn/big',
        },
        # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
        'money': {
            'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
            'media_src': 'http://ht3.cdn.turner.com/money/big',
        },
    }

    @staticmethod
    def _child_text(node, tag):
        """Return the text of the first ``tag`` child of ``node``, or None if there is no such child."""
        child = node.find(tag)
        return child.text if child is not None else None

    def _real_extract(self, url):
        """Download the XML metadata for ``url`` and return the video info dict.

        The returned dict carries ``id``, ``title``, ``formats``,
        ``thumbnails``, ``description``, ``duration`` and ``upload_date``.
        ``description``, ``duration`` and ``upload_date`` are None when the
        corresponding XML element is absent; a missing ``headline`` element
        or ``id`` attribute still raises, since those fields are required.
        """
        sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
        # Sub-domains without their own entry ("www", or none at all) use the
        # "edition" configuration.
        if sub_domain not in self._CONFIG:
            sub_domain = 'edition'
        config = self._CONFIG[sub_domain]
        info_url = config['data_src'] % path
        info = self._download_xml(info_url, page_title)

        formats = []
        # Matches e.g. "640x360" or "640x360_900k".
        rex = re.compile(r'''(?x)
            (?P<width>[0-9]+)x(?P<height>[0-9]+)
            (?:_(?P<bitrate>[0-9]+)k)?
        ''')
        for f in info.findall('files/file'):
            media_path = (f.text or '').strip()
            if not media_path:
                # A <file> element without text yields no usable URL.
                continue
            format_id = f.attrib['bitrate']
            fdct = {
                'format_id': format_id,
                'url': config['media_src'] + media_path,
            }
            # Dimensions are looked up in the bitrate attribute first and,
            # failing that, anywhere in the file path.
            mf = rex.match(format_id) or rex.search(media_path)
            if mf:
                fdct['width'] = int(mf.group('width'))
                fdct['height'] = int(mf.group('height'))
                fdct['tbr'] = int_or_none(mf.group('bitrate'))
            else:
                mi = re.match(r'ios_(audio|[0-9]+)$', format_id)
                if mi:
                    if mi.group(1) == 'audio':
                        fdct['vcodec'] = 'none'
                        fdct['ext'] = 'm4a'
                    else:
                        fdct['tbr'] = int(mi.group(1))
            formats.append(fdct)
        self._sort_formats(formats)

        # Thumbnail dimensions are optional; an entry without a URL is dropped.
        thumbnails = [{
            'height': int_or_none(t.attrib.get('height')),
            'width': int_or_none(t.attrib.get('width')),
            'url': t.text,
        } for t in info.findall('images/image') if t.text]

        metas_el = info.find('metas')
        # The upload date is read from the "version" attribute of <metas>.
        upload_date = (
            metas_el.attrib.get('version') if metas_el is not None else None)

        # <length> is not present for every video, so tolerate its absence.
        duration_text = self._child_text(info, 'length')
        duration = parse_duration(duration_text) if duration_text else None

        return {
            'id': info.attrib['id'],
            'title': info.find('headline').text,
            'formats': formats,
            'thumbnails': thumbnails,
            'description': self._child_text(info, 'description'),
            'duration': duration,
            'upload_date': upload_date,
        }
2014-02-11 21:38:17 +08:00
class CNNBlogsIE(InfoExtractor):
    """Extractor for ``*.blogs.cnn.com`` posts.

    It does not extract media itself: it finds the video URL referenced in
    the page's ``data-url`` attribute and delegates to ``CNNIE``.
    """

    _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
    _TEST = {
        'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
        'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
        'info_dict': {
            'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
            'ext': 'mp4',
            'title': 'Criminalizing journalism?',
            'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
            'upload_date': '20140209',
        },
        'add_ie': ['CNN'],
    }

    def _real_extract(self, url):
        """Return a ``url``-type result pointing at the video referenced by the blog page."""
        # The last path component of the URL serves as the id passed to the download call.
        display_id = url_basename(url)
        page_html = self._download_webpage(url, display_id)
        embedded_url = self._html_search_regex(
            r'data-url="(.+?)"', page_html, 'cnn url')
        result = {
            '_type': 'url',
            'url': embedded_url,
            'ie_key': CNNIE.ie_key(),
        }
        return result
2014-12-23 01:40:36 +08:00
class CNNArticleIE(InfoExtractor):
    """Extractor for cnn.com article pages (anything not under ``/video/`` or ``/videos/``).

    It reads the video path from the page's ``video: '...'`` setting and
    delegates the actual extraction to ``CNNIE``.
    """

    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
    _TEST = {
        'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
        'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
        'info_dict': {
            'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
            'ext': 'mp4',
            'title': 'Obama: Cyberattack not an act of war',
            'description': 'md5:51ce6750450603795cad0cdfbd7d05c5',
            'upload_date': '20141221',
        },
        'add_ie': ['CNN'],
    }

    def _real_extract(self, url):
        """Return a ``url``-type result for the video referenced by the article page."""
        # The last path component of the URL serves as the id passed to the download call.
        display_id = url_basename(url)
        page_html = self._download_webpage(url, display_id)
        video_path = self._html_search_regex(
            r"video:\s*'([^']+)'", page_html, 'cnn url')
        # Rebuild a player-style URL around the path so that CNNIE handles it.
        target_url = 'http://cnn.com/video/?/video/' + video_path
        result = {
            '_type': 'url',
            'url': target_url,
            'ie_key': CNNIE.ie_key(),
        }
        return result