2016-10-15 10:16:43 +08:00
# coding: utf-8
2014-05-03 03:28:38 +08:00
from __future__ import unicode_literals
2016-09-02 02:07:41 +08:00
import hmac
import hashlib
import base64
2014-05-03 03:28:38 +08:00
from . common import InfoExtractor
2015-03-19 23:23:52 +08:00
from . . utils import (
2016-10-16 19:21:02 +08:00
determine_ext ,
2015-03-19 23:23:52 +08:00
float_or_none ,
int_or_none ,
2016-10-15 10:16:43 +08:00
js_to_json ,
2016-09-02 02:07:41 +08:00
mimetype2ext ,
2016-10-16 19:21:02 +08:00
parse_iso8601 ,
remove_start ,
2015-03-19 23:23:52 +08:00
)
2014-05-03 03:28:38 +08:00
2015-05-04 22:32:57 +08:00
class NYTimesBaseIE(InfoExtractor):
    """Shared video-metadata extraction for the nytimes.com extractors.

    Subclasses resolve a numeric video id and delegate to
    ``_extract_video_from_id``.
    """

    # Key for the HMAC signature sent in the v3 API ``Authorization`` header.
    _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'

    def _extract_video_from_id(self, video_id):
        """Download the video JSON for *video_id* and build an info dict.

        The signed v3 endpoint is tried first (non-fatally); when it yields
        nothing the unsigned v2 endpoint is requested instead.

        Returns a dict with ``id``, ``title``, ``description``, ``timestamp``,
        ``uploader``, ``duration``, ``formats`` and ``thumbnails``.

        Raises ``KeyError`` if the JSON has no ``headline`` entry.
        """
        # Authorization generation algorithm is reverse engineered from `signer` in
        # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
        path = '/svc/video/api/v3/video/' + video_id
        hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
        video_data = self._download_json(
            'http://www.nytimes.com' + path, video_id, 'Downloading video JSON',
            headers={
                'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
                'X-NYTV': 'vhs',
            }, fatal=False)
        if not video_data:
            video_data = self._download_json(
                'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
                video_id, 'Downloading video JSON')

        title = video_data['headline']

        def get_file_size(file_size):
            """Normalize a size given as an int or as a ``{'value': ...}`` dict."""
            if isinstance(file_size, int):
                return file_size
            if isinstance(file_size, dict):
                try:
                    return int(file_size.get('value', 0))
                except (TypeError, ValueError):
                    # Malformed optional metadata must not abort extraction.
                    return None
            return None

        seen_urls = set()
        formats = []
        # `or []` also covers a key that is present but null in the JSON.
        for video in video_data.get('renditions') or []:
            video_url = video.get('url')
            format_id = video.get('type')
            if not video_url or format_id == 'thumbs' or video_url in seen_urls:
                continue
            seen_urls.add(video_url)
            ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id or 'hls', fatal=False))
            elif ext == 'mpd':
                # DASH manifests are deliberately skipped for now.
                continue
                # formats.extend(self._extract_mpd_formats(
                #     video_url, video_id, format_id or 'dash', fatal=False))
            else:
                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                    'vcodec': video.get('videoencoding') or video.get('video_codec'),
                    'width': int_or_none(video.get('width')),
                    'height': int_or_none(video.get('height')),
                    'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
                    # Reported bitrate is scaled down by 1000.
                    'tbr': int_or_none(video.get('bitrate'), 1000),
                    'ext': ext,
                })
        self._sort_formats(formats)

        thumbnails = []
        for image in video_data.get('images') or []:
            image_url = image.get('url')
            if not image_url:
                continue
            thumbnails.append({
                # The host prefix is prepended to every image URL.
                'url': 'http://www.nytimes.com/' + image_url,
                'width': int_or_none(image.get('width')),
                'height': int_or_none(image.get('height')),
            })

        publication_date = video_data.get('publication_date')
        # The last 8 characters are dropped before ISO 8601 parsing.
        timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None

        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('summary'),
            'timestamp': timestamp,
            'uploader': video_data.get('byline'),
            # Reported duration is scaled down by 1000.
            'duration': float_or_none(video_data.get('duration'), 1000),
            'formats': formats,
            'thumbnails': thumbnails,
        }
2015-05-04 22:32:57 +08:00
class NYTimesIE(NYTimesBaseIE):
    """Extractor for nytimes.com video pages and graphics8 iframe embeds.

    The numeric video id is read directly from the URL and resolved through
    the inherited ``_extract_video_from_id``.
    """

    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
        'md5': 'd665342765db043f7e225cff19df0f2d',
        'info_dict': {
            'id': '100000002847155',
            'ext': 'mov',
            'title': 'Verbatim: What Is a Photocopier?',
            'description': 'md5:93603dada88ddbda9395632fdc5da260',
            'timestamp': 1398631707,
            'upload_date': '20140427',
            'uploader': 'Brett Weiner',
            'duration': 419,
        }
    }, {
        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the video whose id is embedded in *url*."""
        video_id = self._match_id(url)
        return self._extract_video_from_id(video_id)
class NYTimesArticleIE(NYTimesBaseIE):
    """Extractor for nytimes.com article pages that embed a video or podcast.

    The page is downloaded and searched for a ``data-videoid`` attribute;
    when none is found the podcast JSON pushed to ``NYTD.FlexTypes`` is
    parsed instead.
    """

    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'

    _TESTS = [{
        'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
        'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
        'info_dict': {
            'id': '100000003628438',
            'ext': 'mov',
            'title': 'New Minimum Wage: $70,000 a Year',
            'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
            'timestamp': 1429033037,
            'upload_date': '20150414',
            'uploader': 'Matthew Williams',
        }
    }, {
        'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
        'md5': 'e0d52040cafb07662acf3c9132db3575',
        'info_dict': {
            'id': '100000004709062',
            'title': 'The Run-Up: ‘He Was Like an Octopus’',
            'ext': 'mp3',
            'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
            'series': 'The Run-Up',
            'episode': '‘He Was Like an Octopus’',
            'episode_number': 20,
            'duration': 2130,
        }
    }, {
        'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
        'info_dict': {
            'id': '100000004709479',
            'title': 'The Rise of Hitler',
            'ext': 'mp3',
            'description': 'md5:bce877fd9e3444990cb141875fab0028',
            'creator': 'Pamela Paul',
            'duration': 3475,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
        'only_matching': True,
    }]

    def _extract_podcast_from_json(self, json, page_id, webpage):
        """Build an info dict from the podcast JavaScript object in *json*.

        *json* is the raw object text; it is converted with ``js_to_json``
        before parsing. *webpage* is only consulted for a meta description
        when the track itself carries none.

        Raises ``KeyError`` if ``data``, ``track``, ``title`` or ``source``
        is missing from the parsed object.
        """
        podcast_audio = self._parse_json(
            json, page_id, transform_source=js_to_json)

        audio_data = podcast_audio['data']
        track = audio_data['track']

        episode_title = track['title']
        video_url = track['source']

        description = track.get('description') or self._html_search_meta(
            ['og:description', 'twitter:description'], webpage)

        # Single lookup; `or {}` also covers a key that is present but null.
        podcast = audio_data.get('podcast') or {}
        podcast_title = podcast.get('title')
        title = ('%s: %s' % (podcast_title, episode_title)
                 if podcast_title else episode_title)

        episode = podcast.get('episode') or ''
        episode_number = int_or_none(self._search_regex(
            r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))

        # Avoid handing None to remove_start when the object has no target.
        target = podcast_audio.get('target')
        podcast_id = remove_start(target, 'FT') if target else None

        return {
            'id': podcast_id or page_id,
            'url': video_url,
            'title': title,
            'description': description,
            'creator': track.get('credit'),
            'series': podcast_title,
            'episode': episode_title,
            'episode_number': episode_number,
            'duration': int_or_none(track.get('duration')),
        }

    def _real_extract(self, url):
        """Return the info dict for the video or podcast embedded in *url*."""
        page_id = self._match_id(url)

        webpage = self._download_webpage(url, page_id)

        # A default already makes the lookup non-fatal.
        video_id = self._search_regex(
            r'data-videoid=["\'](\d+)', webpage, 'video id',
            default=None)
        if video_id is not None:
            return self._extract_video_from_id(video_id)

        # The stricter pattern (anchored on the closing script tag) is listed
        # first; the greedy one is the fallback.
        podcast_data = self._search_regex(
            (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
             r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
            webpage, 'podcast data')
        return self._extract_podcast_from_json(podcast_data, page_id, webpage)