2013-07-17 02:50:16 +08:00
# coding: utf-8
2014-01-17 10:32:02 +08:00
from __future__ import unicode_literals
2013-07-17 02:50:16 +08:00
import re
from . common import InfoExtractor
2014-12-13 19:24:42 +08:00
from . . compat import (
2013-07-17 02:50:16 +08:00
compat_urllib_parse ,
compat_urllib_parse_urlparse ,
compat_urlparse ,
)
2014-12-13 19:24:42 +08:00
from . . utils import (
orderedSet ,
)
2013-07-17 02:50:16 +08:00
class CondeNastIE(InfoExtractor):
    """
    Condé Nast is a media group, some of its sites use a custom HTML5 player
    that works the same in all of them.
    """

    # The keys are the supported sites and the values are the name to be shown
    # to the user and in the extractor description.
    _SITES = {
        'allure': 'Allure',
        'architecturaldigest': 'Architectural Digest',
        'arstechnica': 'Ars Technica',
        'bonappetit': 'Bon Appétit',
        'brides': 'Brides',
        'cnevids': 'Condé Nast',
        'cntraveler': 'Condé Nast Traveler',
        'details': 'Details',
        'epicurious': 'Epicurious',
        'glamour': 'Glamour',
        'golfdigest': 'Golf Digest',
        'gq': 'GQ',
        'newyorker': 'The New Yorker',
        'self': 'SELF',
        'teenvogue': 'Teen Vogue',
        'vanityfair': 'Vanity Fair',
        'vogue': 'Vogue',
        'wired': 'WIRED',
        'wmagazine': 'W Magazine',
    }

    # Both plain and TLS page URLs are accepted ("https?"); every previously
    # matching http:// URL still matches.
    _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
    IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))

    # Pattern for player embed URLs (scheme optional, "player." host only).
    EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' % '|'.join(_SITES.keys())

    _TEST = {
        'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
        'md5': '1921f713ed48aabd715691f774c451f7',
        'info_dict': {
            'id': '5171b343c2b4c00dd0c1ccb3',
            'ext': 'mp4',
            'title': '3D Printed Speakers Lit With LED',
            'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
        }
    }

    def _extract_series(self, url, webpage):
        """Return a playlist result for a series page.

        The playlist title is taken from the first <h1> inside the
        "cne-series-info" block of ``webpage``.  Every "/watch/..." link that
        follows a "cne-thumb-title" paragraph is joined with the scheme and
        host of ``url`` and handed back to this extractor ('CondeNast').
        """
        title = self._html_search_regex(
            r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
            webpage, 'series title', flags=re.DOTALL)
        url_object = compat_urllib_parse_urlparse(url)
        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
        m_paths = re.finditer(
            r'<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]',
            webpage, flags=re.DOTALL)
        # NOTE(review): orderedSet presumably drops duplicate paths while
        # keeping first-seen order -- confirm against utils.
        paths = orderedSet(m.group(1) for m in m_paths)
        entries = [
            self.url_result(compat_urlparse.urljoin(base_url, path), 'CondeNast')
            for path in paths]
        return self.playlist_result(entries, playlist_title=title)

    @staticmethod
    def _format_entry(fdata):
        """Build one format dict from a single source description.

        ``fdata`` must provide 'type', 'quality' and 'src'.  The text after
        the last '/' of 'type' is used both as the extension and as the first
        half of the format id.
        """
        ext = fdata['type'].split('/')[-1]
        quality = fdata['quality']
        return {
            'format_id': '%s-%s' % (ext, quality),
            'url': fdata['src'],
            'ext': ext,
            # Only 'high' is ranked above every other quality label.
            'quality': 1 if quality == 'high' else 0,
        }

    def _extract_video(self, webpage, url_type):
        """Return the info dict for a single video page.

        ``url_type`` is the "type" group matched by ``_VALID_URL``; for
        'embed' pages no description is looked up and it is reported as None.
        The player parameters (videoId, playerId, target) are read from the
        ``var params = {...}`` script block of ``webpage`` and sent to the
        player loader URL, whose response carries the ``var video = {...};``
        JSON that is parsed for title, thumbnail and sources.
        """
        if url_type != 'embed':
            # fatal=False is passed, so the description is looked up on a
            # best-effort basis.
            description = self._html_search_regex(
                [
                    r'<div class="cne-video-description">(.+?)</div>',
                    r'<div class="video-post-content">(.+?)</div>',
                ],
                webpage, 'description', fatal=False, flags=re.DOTALL)
        else:
            description = None

        params = self._search_regex(
            r'var params = {(.+?)}[;,]', webpage,
            'player params', flags=re.DOTALL)
        video_id = self._search_regex(
            r'videoId: [\'"](.+?)[\'"]', params, 'video id')
        player_id = self._search_regex(
            r'playerId: [\'"](.+?)[\'"]', params, 'player id')
        target = self._search_regex(
            r'target: [\'"](.+?)[\'"]', params, 'target')

        data = compat_urllib_parse.urlencode({
            'videoId': video_id,
            'playerId': player_id,
            'target': target,
        })
        # The default already ends in '?', so the query string is appended
        # directly.
        base_info_url = self._search_regex(
            r'url = [\'"](.+?)[\'"][,;]',
            webpage, 'base info url',
            default='http://player.cnevids.com/player/loader.js?')
        info_url = base_info_url + data
        info_page = self._download_webpage(
            info_url, video_id, 'Downloading video info')
        video_info = self._search_regex(
            r'var\s+video\s*=\s*({.+?});', info_page, 'video info')
        video_info = self._parse_json(video_info, video_id)

        # Only the first entry of 'sources' is iterated for formats.
        formats = [
            self._format_entry(fdata)
            for fdata in video_info['sources'][0]]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            'title': video_info['title'],
            # The thumbnail is optional metadata; a missing 'poster_frame'
            # must not abort an otherwise successful extraction.
            'thumbnail': video_info.get('poster_frame'),
            'description': description,
        }

    def _real_extract(self, url):
        """Dispatch ``url`` to series or single-video extraction.

        The site, URL type and item id are taken from the ``_VALID_URL``
        match; 'series' URLs yield a playlist, every other type a video.
        """
        mobj = re.match(self._VALID_URL, url)
        site = mobj.group('site')
        url_type = mobj.group('type')
        item_id = mobj.group('id')

        self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
        webpage = self._download_webpage(url, item_id)

        if url_type == 'series':
            return self._extract_series(url, webpage)
        return self._extract_video(webpage, url_type)