# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlparse,
    compat_urlparse,
)
from ..utils import (
    orderedSet,
    remove_end,
    extract_attributes,
    mimetype2ext,
    determine_ext,
    int_or_none,
    parse_iso8601,
)


class CondeNastIE(InfoExtractor):
    """
    Condé Nast is a media group, some of its sites use a custom HTML5 player
    that works the same in all of them.
    """

    # The keys are the supported sites and the values are the name to be shown
    # to the user and in the extractor description.
    _SITES = {
        'allure': 'Allure',
        'architecturaldigest': 'Architectural Digest',
        'arstechnica': 'Ars Technica',
        'bonappetit': 'Bon Appétit',
        'brides': 'Brides',
        'cnevids': 'Condé Nast',
        'cntraveler': 'Condé Nast Traveler',
        'details': 'Details',
        'epicurious': 'Epicurious',
        'glamour': 'Glamour',
        'golfdigest': 'Golf Digest',
        'gq': 'GQ',
        'newyorker': 'The New Yorker',
        'self': 'SELF',
        'teenvogue': 'Teen Vogue',
        'vanityfair': 'Vanity Fair',
        'vogue': 'Vogue',
        'wired': 'WIRED',
        'wmagazine': 'W Magazine',
    }

    # Accepts watch/series/video/embed/embedjs URLs on the video., www. or
    # player. subdomain of any site listed in _SITES. The named groups
    # (site, type, id) are unpacked positionally in _real_extract, so their
    # order matters.
    _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
    IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))

    # Pattern for player embed URLs (scheme optional). Not referenced inside
    # this class.
    EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())

    _TESTS = [{
        'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
        'md5': '1921f713ed48aabd715691f774c451f7',
        'info_dict': {
            'id': '5171b343c2b4c00dd0c1ccb3',
            'ext': 'mp4',
            'title': '3D Printed Speakers Lit With LED',
            'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
            'uploader': 'wired',
            'upload_date': '20130314',
            'timestamp': 1363219200,
        }
    }, {
        # JS embed
        'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
        'md5': 'f1a6f9cafb7083bab74a710f65d08999',
        'info_dict': {
            'id': '55f9cf8b61646d1acf00000c',
            'ext': 'mp4',
            'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
            'uploader': 'arstechnica',
            'upload_date': '20150916',
            'timestamp': 1442434955,
        }
    }]

    def _extract_series(self, url, webpage):
        """Build a playlist result from a series page.

        The playlist title is taken from the <h1> inside the
        "cne-series-info" block. Each "/watch/..." link found under a
        "cne-thumb-title" paragraph becomes one entry, resolved against the
        scheme and host of ``url`` and delegated to this extractor
        ('CondeNast').
        """
        title = self._html_search_regex(
            r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>',
            webpage, 'series title')
        url_object = compat_urllib_parse_urlparse(url)
        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
        # The capture stops at the closing quote or at '?', so any query
        # string on the link is dropped.
        m_paths = re.finditer(
            r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
        # NOTE(review): orderedSet presumably de-duplicates while keeping
        # first-seen order -- confirm against utils.
        paths = orderedSet(m.group(1) for m in m_paths)
        entries = [
            self.url_result(compat_urlparse.urljoin(base_url, path), 'CondeNast')
            for path in paths]
        return self.playlist_result(entries, playlist_title=title)

    def _extract_video(self, webpage, url_type):
        """Extract a single video described by ``webpage``.

        The player parameters (videoId, playerId, target) are read either
        from an inline ``var params = {...}`` script or, when that is absent,
        from the attributes of the element marked data-js="video-player".
        They are sent as the query of the player/video.js request, whose
        ``loadCallback({...})`` payload supplies the title, sources and
        remaining metadata.

        ``url_type`` is the URL kind matched by _VALID_URL; for 'embed' the
        JSON-LD lookup on ``webpage`` is skipped.

        Returns the info dict for the video.
        """
        query = {}
        params = self._search_regex(
            r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
        if params:
            query.update({
                'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'),
                'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'),
                'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'),
            })
        else:
            params = extract_attributes(self._search_regex(
                r'(<[^>]+data-js="video-player"[^>]+>)',
                webpage, 'player params element'))
            query.update({
                'videoId': params['data-video'],
                'playerId': params['data-player'],
                'target': params['id'],
            })
        video_id = query['videoId']
        info_page = self._download_webpage(
            'http://player.cnevids.com/player/video.js',
            video_id, 'Downloading video info', query=query)
        video_info = self._parse_json(self._search_regex(
            r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video']
        title = video_info['title']

        formats = []
        # Only the first group in 'sources' is used. When the key is missing,
        # None or an empty list, fall back to a single empty group so the loop
        # simply yields nothing instead of raising IndexError/TypeError.
        sources = video_info.get('sources') or [{}]
        for fdata in sources[0]:
            src = fdata.get('src')
            if not src:
                continue
            ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
            quality = fdata.get('quality')
            formats.append({
                'format_id': ext + ('-%s' % quality if quality else ''),
                'url': src,
                'ext': ext,
                # Rank 'high' above every other quality label.
                'quality': 1 if quality == 'high' else 0,
            })
        self._sort_formats(formats)

        info = self._search_json_ld(webpage, video_id) if url_type != 'embed' else {}
        # Values from the player response replace same-named JSON-LD keys.
        info.update({
            'id': video_id,
            'formats': formats,
            'title': title,
            'thumbnail': video_info.get('poster_frame'),
            'uploader': video_info.get('brand'),
            'duration': int_or_none(video_info.get('duration')),
            'tags': video_info.get('tags'),
            'series': video_info.get('series_title'),
            'season': video_info.get('season_title'),
            'timestamp': parse_iso8601(video_info.get('premiere_date')),
        })
        return info

    def _real_extract(self, url):
        """Download the page at ``url`` and dispatch on its URL type.

        'series' URLs yield a playlist via _extract_series; every other type
        yields a single video via _extract_video. 'embedjs' URLs are first
        rewritten to the equivalent 'embed' URL.
        """
        site, url_type, item_id = re.match(self._VALID_URL, url).groups()

        # Convert JS embed to regular embed
        if url_type == 'embedjs':
            parsed_url = compat_urlparse.urlparse(url)
            url = compat_urlparse.urlunparse(parsed_url._replace(
                path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
            url_type = 'embed'

        self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
        webpage = self._download_webpage(url, item_id)

        if url_type == 'series':
            return self._extract_series(url, webpage)
        return self._extract_video(webpage, url_type)