2015-07-13 20:41:38 +08:00
# coding: utf-8
from __future__ import unicode_literals
import re
2015-11-03 18:36:54 +08:00
import os . path
2015-07-13 20:41:38 +08:00
from . common import InfoExtractor
2015-11-03 18:36:54 +08:00
from . . compat import compat_urlparse
from . . utils import (
url_basename ,
remove_start ,
)
2015-07-13 20:41:38 +08:00
class DemocracynowIE(InfoExtractor):
    """Extractor for democracynow.org pages.

    The page embeds a JSON object inside a ``<script type="text/json">`` tag.
    The title, the media URLs (``file``/``audio``/``video`` keys) and the
    caption tracks (``caption_file``/``captions`` keys) are read from it.
    """

    # The dot in the host name is escaped so it only matches a literal '.'.
    _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)'
    IE_NAME = 'democracynow'
    # Both test pages are expected to yield the same media id and checksum.
    _TESTS = [{
        'url': 'http://www.democracynow.org/shows/2015/7/3',
        'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
        'info_dict': {
            'id': '2015-0703-001',
            'ext': 'mp4',
            'title': 'July 03, 2015 - Democracy Now!',
            'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
        },
    }, {
        'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
        'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
        'info_dict': {
            'id': '2015-0703-001',
            'ext': 'mp4',
            'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
            'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
        },
    }]

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``subtitles`` and ``formats``. ``id`` falls back to the display id
        taken from the URL when no media URL is present in the page JSON.
        Raises ``KeyError`` if the page JSON has no ``title`` entry.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        description = self._og_search_description(webpage)

        json_data = self._parse_json(self._search_regex(
            r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
            display_id)

        default_lang = 'en'
        subtitles = {}

        def add_subtitle_item(lang, info_dict):
            # Group subtitle entries by language code.
            subtitles.setdefault(lang, []).append(info_dict)

        # chapter_file are not subtitles
        if 'caption_file' in json_data:
            add_subtitle_item(default_lang, {
                'url': compat_urlparse.urljoin(url, json_data['caption_file']),
            })

        for subtitle_item in json_data.get('captions', []):
            subtitle_url = subtitle_item.get('url')
            if not subtitle_url:
                # A caption entry without a URL is unusable; skip it instead
                # of aborting the whole extraction.
                continue
            # A missing, null or empty language falls back to the default.
            lang = (subtitle_item.get('language') or default_lang).lower()
            add_subtitle_item(lang, {
                'url': compat_urlparse.urljoin(url, subtitle_url),
            })

        video_id = None
        formats = []
        for key in ('file', 'audio', 'video'):
            media_url = json_data.get(key, '')
            if not media_url:
                continue
            # Resolve against the page URL and drop any query string.
            media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
            # The id comes from the first media URL found: its basename with
            # the extension split off, passed through remove_start(..., 'dn').
            video_id = video_id or remove_start(
                os.path.splitext(url_basename(media_url))[0], 'dn')
            formats.append({
                'url': media_url,
            })

        self._sort_formats(formats)

        return {
            'id': video_id or display_id,
            'title': json_data['title'],
            'description': description,
            'subtitles': subtitles,
            'formats': formats,
        }