2013-07-17 02:50:16 +08:00
# coding: utf-8
2014-01-17 10:32:02 +08:00
from __future__ import unicode_literals
2013-07-17 02:50:16 +08:00
import re
from . common import InfoExtractor
2014-12-13 19:24:42 +08:00
from . . compat import (
2013-07-17 02:50:16 +08:00
compat_urllib_parse_urlparse ,
compat_urlparse ,
)
2014-12-13 19:24:42 +08:00
from . . utils import (
2016-08-05 01:28:49 +08:00
determine_ext ,
2017-03-23 00:22:14 +08:00
extract_attributes ,
2016-08-05 01:28:49 +08:00
int_or_none ,
2017-03-23 00:22:14 +08:00
js_to_json ,
mimetype2ext ,
orderedSet ,
2016-08-05 01:28:49 +08:00
parse_iso8601 ,
2017-03-23 00:22:14 +08:00
remove_end ,
2014-12-13 19:24:42 +08:00
)
2013-07-17 02:50:16 +08:00
class CondeNastIE(InfoExtractor):
    """
    Extractor for the custom HTML5 player shared by Condé Nast sites.

    Condé Nast is a media group, some of its sites use a custom HTML5 player
    that works the same in all of them.  Single videos ("watch", "video",
    "embed" and "embedjs" URLs) yield one info dict; "series" URLs yield a
    playlist whose entries are handed back to this extractor.
    """

    # The keys are the supported sites and the values are the name to be shown
    # to the user and in the extractor description.
    _SITES = {
        'allure': 'Allure',
        'architecturaldigest': 'Architectural Digest',
        'arstechnica': 'Ars Technica',
        'bonappetit': 'Bon Appétit',
        'brides': 'Brides',
        'cnevids': 'Condé Nast',
        'cntraveler': 'Condé Nast Traveler',
        'details': 'Details',
        'epicurious': 'Epicurious',
        'glamour': 'Glamour',
        'golfdigest': 'Golf Digest',
        'gq': 'GQ',
        'newyorker': 'The New Yorker',
        'self': 'SELF',
        'teenvogue': 'Teen Vogue',
        'vanityfair': 'Vanity Fair',
        'vogue': 'Vogue',
        'wired': 'WIRED',
        'wmagazine': 'W Magazine',
    }

    # Named groups: site (one of the _SITES keys), type (kind of page) and id.
    _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
    IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))

    # Pattern for player embeds (scheme optional, "player." host only).
    EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())

    _TESTS = [{
        'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
        'md5': '1921f713ed48aabd715691f774c451f7',
        'info_dict': {
            'id': '5171b343c2b4c00dd0c1ccb3',
            'ext': 'mp4',
            'title': '3D Printed Speakers Lit With LED',
            'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
            'uploader': 'wired',
            'upload_date': '20130314',
            'timestamp': 1363219200,
        },
    }, {
        'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
        'info_dict': {
            'id': '58d1865bfd2e6126e2000015',
            'ext': 'mp4',
            'title': 'The Only True Surprise? Trump’s an Idiot',
            'uploader': 'gq',
            'upload_date': '20170321',
            'timestamp': 1490126427,
        },
    }, {
        # JS embed
        'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
        'md5': 'f1a6f9cafb7083bab74a710f65d08999',
        'info_dict': {
            'id': '55f9cf8b61646d1acf00000c',
            'ext': 'mp4',
            'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
            'uploader': 'arstechnica',
            'upload_date': '20150916',
            'timestamp': 1442434955,
        },
    }]

    def _extract_series(self, url, webpage):
        """
        Turn a series page into a playlist result.

        url is the address of the series page (only its scheme and host are
        used, to make the relative links absolute); webpage is its HTML.
        The playlist title is taken from the <h1> inside the
        "cne-series-info" block, and one entry is created per "/watch/..."
        link found in a "cne-thumb-title" paragraph.
        """
        title = self._html_search_regex(
            r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>',
            webpage, 'series title')
        url_object = compat_urllib_parse_urlparse(url)
        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
        m_paths = re.finditer(
            r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
        # The capture stops at the closing quote or at '?', so any query
        # string on the link is dropped.  orderedSet presumably removes
        # repeated links while keeping page order.
        paths = orderedSet(m.group(1) for m in m_paths)
        entries = [
            self.url_result(compat_urlparse.urljoin(base_url, path), 'CondeNast')
            for path in paths]
        return self.playlist_result(entries, playlist_title=title)

    def _extract_player_query(self, webpage):
        """
        Return the videoId/playerId/target query dict for the player API.

        The values are read from the inline "var params = {...}" script
        object when the page has one, otherwise from the attributes of the
        element marked data-js="video-player".
        """
        params = self._search_regex(
            r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
        if params:
            return {
                'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'),
                'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'),
                'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'),
            }
        attrs = extract_attributes(self._search_regex(
            r'(<[^>]+data-js="video-player"[^>]+>)',
            webpage, 'player params element'))
        return {
            'videoId': attrs['data-video'],
            'playerId': attrs['data-player'],
            'target': attrs['id'],
        }

    def _fetch_video_info(self, video_id, query):
        """
        Download the player's description of the video as a dict.

        video.js is requested first as JSON (non-fatally); when it yields no
        usable "video" entry, loader.js is downloaded instead and the
        "video" entry of its "var config = {...};" object is used.
        """
        video_info = None
        info_page = self._download_json(
            'http://player.cnevids.com/player/video.js',
            video_id, 'Downloading video info', fatal=False, query=query)
        # Only a JSON object can carry a 'video' entry; anything else falls
        # through to the loader.js fallback instead of raising AttributeError.
        if isinstance(info_page, dict):
            video_info = info_page.get('video')
        if not video_info:
            info_page = self._download_webpage(
                'http://player.cnevids.com/player/loader.js',
                video_id, 'Downloading loader info', query=query)
            video_info = self._parse_json(
                self._search_regex(
                    r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
                video_id, transform_source=js_to_json)['video']
        return video_info

    def _build_formats(self, video_info, video_id):
        """
        Build the format list from video_info['sources'].

        Sources without a 'src' are skipped, HLS manifests are expanded via
        _extract_m3u8_formats (non-fatally), and every other source becomes
        a single format whose 'quality' is 1 for "high" and 0 otherwise.
        """
        formats = []
        for fdata in video_info['sources']:
            src = fdata.get('src')
            if not src:
                continue
            # Prefer the declared MIME type, fall back to the URL extension.
            ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue
            quality = fdata.get('quality')
            formats.append({
                'format_id': ext + ('-%s' % quality if quality else ''),
                'url': src,
                'ext': ext,
                'quality': 1 if quality == 'high' else 0,
            })
        self._sort_formats(formats)
        return formats

    def _extract_video(self, webpage, url_type):
        """
        Extract a single video from the HTML of its page.

        webpage is the page HTML and url_type the "type" group of the
        matched URL.  For every type except 'embed' the page's JSON-LD is
        used as the base of the result; data from the player API is then
        layered on top of it.  Returns the info dict.
        """
        query = self._extract_player_query(webpage)
        video_id = query['videoId']
        video_info = self._fetch_video_info(video_id, query)
        title = video_info['title']
        formats = self._build_formats(video_info, video_id)

        info = self._search_json_ld(
            webpage, video_id, fatal=False) if url_type != 'embed' else {}
        info.update({
            'id': video_id,
            'formats': formats,
            'title': title,
        })
        optional_fields = {
            'thumbnail': video_info.get('poster_frame'),
            'uploader': video_info.get('brand'),
            'duration': int_or_none(video_info.get('duration')),
            'tags': video_info.get('tags'),
            'series': video_info.get('series_title'),
            'season': video_info.get('season_title'),
            'timestamp': parse_iso8601(video_info.get('premiere_date')),
        }
        # Player data wins over JSON-LD when present, but a field the player
        # does not provide must not erase the value JSON-LD supplied.
        info.update(
            (key, value) for key, value in optional_fields.items()
            if value is not None)
        return info

    def _real_extract(self, url):
        """
        Entry point: download the page behind url and dispatch on its type.

        'embedjs' URLs are first rewritten to the equivalent 'embed' URL;
        'series' pages produce a playlist, everything else a single video.
        """
        mobj = re.match(self._VALID_URL, url)
        url_type = mobj.group('type')
        item_id = mobj.group('id')

        # Convert JS embed to regular embed
        if url_type == 'embedjs':
            parsed_url = compat_urlparse.urlparse(url)
            url = compat_urlparse.urlunparse(parsed_url._replace(
                path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
            url_type = 'embed'

        webpage = self._download_webpage(url, item_id)

        if url_type == 'series':
            return self._extract_series(url, webpage)
        return self._extract_video(webpage, url_type)