2014-01-17 10:52:17 +08:00
from __future__ import unicode_literals
2013-06-24 03:55:53 +08:00
import json
import re
2013-11-03 02:48:39 +08:00
from . subtitles import SubtitlesInfoExtractor
2013-06-24 03:55:53 +08:00
2013-11-05 19:00:13 +08:00
from . . utils import (
RegexNotFoundError ,
)
2014-01-17 10:52:17 +08:00
2013-11-03 02:48:39 +08:00
class TEDIE(SubtitlesInfoExtractor):
    """Information extractor for talks and playlists hosted on www.ted.com.

    A URL is classified by ``_VALID_URL`` as either a single talk
    (``/talks/<name>``) or a playlist (``/playlists/<id>/<name>``) and is
    dispatched accordingly by ``_real_extract``.
    """

    # Verbose pattern: whitespace and ``#`` comments inside it are ignored by
    # the regex engine.  The inline ``(?x)`` flag must stay at the very start
    # of the pattern.
    _VALID_URL = r'''(?x)http://www\.ted\.com/
        (
            ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"
        '''

    _TEST = {
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'file': '102.mp4',
        'md5': '4ea1dada91e4174b53dac2bb8ace429d',
        'info_dict': {
            'title': 'The illusion of consciousness',
            'description': 'Philosopher Dan Dennett makes a compelling argument that not only don\'t we understand our own consciousness, but that half the time our brains are actively fooling us.',
            'uploader': 'Dan Dennett',
        }
    }

    # Ranking of the known download qualities; a quality that is not listed
    # here gets preference -1 in ``_talk_info``.
    _FORMATS_PREFERENCE = {
        'low': 1,
        'medium': 2,
        'high': 3,
    }

    def _real_extract(self, url):
        """Extract either a single talk or a playlist, depending on *url*.

        Returns the talk info dict for talk URLs.  For playlist URLs the
        playlist result is returned wrapped in a one-element list.
        """
        # ``_VALID_URL`` already carries the inline ``(?x)`` flag, so no
        # extra ``re.VERBOSE`` is needed (same call as in ``_talk_info``).
        m = re.match(self._VALID_URL, url)
        if m.group('type_talk'):
            return self._talk_info(url)

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(
            'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id):
        """Return a playlist result with one URL entry per talk on the page.

        *name* is accepted for the caller's convenience but is not used here.
        """
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading playlist webpage')

        # Every talk of the playlist is linked from a <p class="talk-title...">.
        matches = re.finditer(
            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
            webpage)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # The matched hrefs are site-relative, so prefix the host.
        playlist_entries = [
            self.url_result('http://www.ted.com' + m.group('talk_url'), 'TED')
            for m in matches
        ]
        return self.playlist_result(
            playlist_entries,
            playlist_id=playlist_id,
            playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the info dict for the talk at *url*.

        *video_id* is only used while downloading the page; the id that is
        returned comes from the JSON embedded in the page.

        Returns ``None`` (after listing the available subtitles) when the
        downloader's ``listsubtitles`` parameter is set.
        """
        m = re.match(self._VALID_URL, url)
        video_name = m.group('name')
        webpage = self._download_webpage(
            url, video_id, 'Downloading "%s" page' % video_name)
        self.report_extraction(video_name)

        # The talk metadata is the JSON object passed to "talkPage.init"
        # inside a <script> tag of the page.
        info_json = self._search_regex(
            r'"talkPage.init",({.+})\)</script>', webpage, 'info json')
        info = json.loads(info_json)
        talk_info = info['talks'][0]

        # One mp4 format per entry of 'nativeDownloads' (quality name -> URL).
        formats = [{
            'ext': 'mp4',
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
            'preference': self._FORMATS_PREFERENCE.get(format_id, -1),
        } for (format_id, format_url) in talk_info['nativeDownloads'].items()]
        self._sort_formats(formats)

        video_id = talk_info['id']

        # Subtitles are requested before the 'listsubtitles' check, so the
        # call happens in both modes.
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        return {
            'id': video_id,
            'title': talk_info['title'],
            'uploader': talk_info['speaker'],
            'thumbnail': talk_info['thumb'],
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
        }

    def _get_available_subtitles(self, video_id, talk_info):
        """Map each subtitle language code of the talk to its SRT URL.

        The language codes are read from ``talk_info['languages']``.  When
        that list is missing or empty, a warning is reported through the
        downloader and an empty dict is returned.
        """
        languages = [
            lang['languageCode'] for lang in talk_info.get('languages', [])
        ]
        if not languages:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

        # dict() over a generator instead of a dict comprehension: the rest
        # of the file uses no syntax newer than that.
        return dict(
            (lang_code,
             'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt'
             % (video_id, lang_code))
            for lang_code in languages
        )