2014-02-26 07:06:31 +08:00
from __future__ import unicode_literals
2013-08-28 18:51:22 +08:00
import re
import json
from . common import InfoExtractor
2014-02-26 07:29:45 +08:00
from . youtube import YoutubeIE
2013-08-28 18:51:22 +08:00
from . . utils import (
2014-02-26 07:06:31 +08:00
compat_urlparse ,
2013-08-28 18:51:22 +08:00
clean_html ,
2014-02-26 07:41:13 +08:00
ExtractorError ,
2013-08-28 18:51:22 +08:00
get_element_by_id ,
)
class TechTVMITIE(InfoExtractor):
    """Information extractor for techtv.mit.edu video and embed pages."""

    IE_NAME = 'techtv.mit.edu'
    _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
        'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
        'info_dict': {
            'id': '25418',
            'ext': 'mp4',
            'title': 'MIT DNA Learning Center Set',
            'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
        },
    }

    def _real_extract(self, url):
        """Download the video page for *url* and build its info dict.

        The numeric id is taken from the URL, the canonical
        ``/videos/<id>`` page is downloaded, and the format list, title,
        description and thumbnail are scraped from it.

        Returns a dict with the keys ``id``, ``title``, ``formats``,
        ``description`` and ``thumbnail``.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the /videos/ page, even when an /embeds/ URL was given.
        page_url = 'http://techtv.mit.edu/videos/%s' % video_id
        raw_page = self._download_webpage(page_url, video_id)

        # Title and description are looked up in a copy of the page with
        # HTML comments removed; the regex searches use the raw page.
        html_comment_re = re.compile(r'<!--.*?-->', re.S)
        clean_page = html_comment_re.sub('', raw_page)

        base_url = self._search_regex(
            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
        formats_json = self._search_regex(
            r'bitrates: (\[.+?\])', raw_page, 'video formats')

        formats = []
        for entry in json.loads(formats_json):
            label = entry['label']
            # Each entry's 'url' is split at the first ':'; the part before
            # it is used as the extension, the part after it is appended to
            # the base URL.
            ext, _sep, path = entry['url'].partition(':')
            formats.append({
                'format_id': label,
                'url': base_url + path,
                'ext': ext,
                'format': label,
                'width': entry['width'],
                'vbr': entry['bitrate'],
            })

        title = get_element_by_id('edit-title', clean_page)
        description = clean_html(
            get_element_by_id('edit-description', clean_page))
        thumbnail = self._search_regex(
            r'playlist:.*?url: \'(.+?)\'',
            raw_page, 'thumbnail', flags=re.DOTALL)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
        }
2013-08-28 18:51:22 +08:00
class MITIE(TechTVMITIE):
    """Information extractor for video.mit.edu watch pages.

    The watch page itself is only used to find an ``<iframe>`` source,
    which is then delegated to the ``TechTVMIT`` extractor.
    """

    IE_NAME = 'video.mit.edu'
    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'

    # The expected output is described through 'id' and 'ext' in info_dict,
    # matching TechTVMITIE._TEST; no separate 'file' entry is used.
    _TEST = {
        'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
        'md5': '7db01d5ccc1895fc5010e9c9e13648da',
        'info_dict': {
            'id': '21783',
            'ext': 'mp4',
            'title': 'The Government is Profiling You',
            'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
        },
    }

    def _real_extract(self, url):
        """Resolve a watch page to the URL of the player it embeds.

        The ``title`` group of the URL is used as the identifier passed to
        the page download. Returns the result of ``self.url_result`` for
        the captured iframe ``src``, targeted at the ``TechTVMIT`` extractor.
        """
        mobj = re.match(self._VALID_URL, url)
        page_title = mobj.group('title')
        webpage = self._download_webpage(url, page_title)

        embed_url = self._search_regex(
            r'<iframe .*?src="(.+?)"', webpage, 'embed url')
        return self.url_result(embed_url, ie='TechTVMIT')
2014-02-26 04:44:34 +08:00
2014-02-26 07:29:45 +08:00
2014-02-26 04:44:34 +08:00
class OCWMITIE(InfoExtractor):
    """Information extractor for ocw.mit.edu course pages.

    The course page embeds its video through a JavaScript call whose
    arguments carry the media URL and a captions file; the media URL is
    handed on to the ``Youtube`` extractor as a ``url_transparent`` result.
    """

    IE_NAME = 'ocw.mit.edu'
    _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
    _BASE_URL = 'http://ocw.mit.edu/'

    _TESTS = [
        {
            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
            'info_dict': {
                'id': 'EObHWIEKGjA',
                'ext': 'mp4',
                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
                #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
            }
        },
        {
            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
            'info_dict': {
                'id': '7K1sB05pE0A',
                'ext': 'mp4',
                'title': 'Session 1: Introduction to Derivatives',
                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
                #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
            }
        }
    ]

    def _real_extract(self, url):
        """Locate the embedded media call on the page and delegate to Youtube.

        Returns a ``url_transparent`` info dict carrying the title and
        description taken from the page's meta tags, the media URL, and the
        captions URL resolved against ``_BASE_URL``.

        Raises ExtractorError when neither embed call is present on the page.
        An IndexError propagates if the matched call has fewer arguments
        than the indices below expect.
        """
        mobj = re.match(self._VALID_URL, url)
        topic = mobj.group('topic')

        webpage = self._download_webpage(url, topic)
        title = self._html_search_meta('WT.cg_s', webpage)
        description = self._html_search_meta('Description', webpage)

        # Each entry pairs the pattern of an embed call with the position of
        # its captions_file argument. They are tried in order:
        #   ocw_embed_chapter_media(container_id, media_url, provider,
        #       page_url, image_url, start, stop, captions_file)
        #   ocw_embed_media(container_id, media_url, provider, page_url,
        #       image_url, captions_file)
        embed_calls = (
            (r'ocw_embed_chapter_media\((.+?)\)', 7),
            (r'ocw_embed_media\((.+?)\)', 5),
        )
        for pattern, captions_index in embed_calls:
            embed = re.search(pattern, webpage)
            if embed:
                break
        else:
            raise ExtractorError('Unable to find embedded YouTube video.')

        # Strip the quote characters, then split the argument list on commas
        # (with an optional single space after each comma).
        metadata = re.sub(r'[\'"]', '', embed.group(1))
        metadata = re.split(r', ?', metadata)
        yt = metadata[1]
        subs = compat_urlparse.urljoin(self._BASE_URL, metadata[captions_index])

        video_id = YoutubeIE.extract_id(yt)

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'title': title,
            'description': description,
            'url': yt,
            # A stray 'url_transparent' literal used to precede this key;
            # adjacent string literals concatenate, so the entry was being
            # stored under 'url_transparentsubtitles' instead.
            # NOTE(review): the value is a single captions URL string --
            # confirm that consumers of 'subtitles' accept this shape.
            'subtitles': subs,
            'ie_key': 'Youtube',
        }