2014-03-25 06:21:20 +08:00
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . utils import (
int_or_none ,
strip_jsonp ,
)
class WashingtonPostIE(InfoExtractor):
    """Extract the videos embedded in a washingtonpost.com article page.

    Every ``data-video-uuid`` attribute found in the page markup becomes one
    entry of the returned playlist; per-video metadata is fetched from the
    site's ``videojson`` endpoint.
    """

    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
    _TEST = {
        'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
        'info_dict': {
            'title': 'Sinkhole of bureaucracy',
        },
        'playlist': [{
            'md5': '79132cc09ec5309fa590ae46e4cc31bc',
            'info_dict': {
                'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
                'ext': 'mp4',
                'title': 'Breaking Points: The Paper Mine',
                'duration': 1287,
                'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
                'uploader': 'The Washington Post',
                'timestamp': 1395527908,
                'upload_date': '20140322',
            },
        }, {
            'md5': 'e1d5734c06865cc504ad99dc2de0d443',
            'info_dict': {
                'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
                'ext': 'mp4',
                'title': 'The town bureaucracy sustains',
                'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
                'duration': 2217,
                'timestamp': 1395528005,
                'upload_date': '20140322',
                'uploader': 'The Washington Post',
            },
        }]
    }

    def _real_extract(self, url):
        """Build a playlist from all videos referenced by the article at *url*.

        Returns a dict with ``_type`` ``'playlist'``, the page id taken from
        the URL, the page's Open Graph title, and one entry per
        ``data-video-uuid`` found in the page.

        Raises ``KeyError``/``IndexError`` if the video JSON lacks the
        expected ``[0]['contentConfig']`` structure or a ``title``.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)

        title = self._og_search_title(webpage)
        uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
        total = len(uuids)

        entries = []
        for i, uuid in enumerate(uuids, start=1):
            vinfo_all = self._download_json(
                'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
                page_id,
                transform_source=strip_jsonp,
                note='Downloading information of video %d/%d' % (i, total)
            )
            vinfo = vinfo_all[0]['contentConfig']

            # ``or {}`` also covers an explicit JSON null, which a plain
            # ``.get(key, {})`` default would let through as None.
            uploader = (vinfo.get('credits') or {}).get('source')
            # NOTE(review): the second argument is presumably a divisor
            # (milliseconds -> seconds); confirm against utils.int_or_none.
            timestamp = int_or_none(
                (vinfo.get('dateConfig') or {}).get('dateFirstPublished'), 1000)

            formats = []
            for s in vinfo.get('streams', []):
                stream_type = s.get('type')
                width = s.get('width')
                bitrate = s.get('bitrate')
                # A width of exactly 0 marks the stream as audio-only here;
                # a missing width (None) is still treated as video.
                has_video = width != 0
                formats.append({
                    'format_id': (
                        '%s-%s-%s' % (stream_type, width, bitrate)
                        if width
                        else stream_type),
                    'vbr': bitrate if has_video else None,
                    'width': width,
                    'height': s.get('height'),
                    'acodec': s.get('audioCodec'),
                    'vcodec': s.get('videoCodec') if has_video else 'none',
                    'filesize': s.get('fileSize'),
                    'url': s.get('url'),
                    'ext': 'mp4',
                    # Unknown stream types map to None.
                    'protocol': {
                        'MP4': 'http',
                        'F4F': 'f4m',
                    }.get(stream_type)
                })

            source_media_url = vinfo.get('sourceMediaURL')
            if source_media_url:
                formats.append({
                    'format_id': 'source_media',
                    'url': source_media_url,
                })
            self._sort_formats(formats)

            entries.append({
                'id': uuid,
                'title': vinfo['title'],
                'description': vinfo.get('blurb'),
                'uploader': uploader,
                'formats': formats,
                # NOTE(review): scale of 100 kept from the original; the
                # unit of ``videoDuration`` is not visible here - confirm.
                'duration': int_or_none(vinfo.get('videoDuration'), 100),
                'timestamp': timestamp,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': page_id,
            'title': title,
        }