2013-06-24 01:58:33 +08:00
# coding: utf-8
2014-09-13 13:51:06 +08:00
from __future__ import unicode_literals
2013-09-22 16:37:23 +08:00
import itertools
2013-06-24 01:58:33 +08:00
import json
2013-09-22 06:35:03 +08:00
import os . path
2013-06-24 01:58:33 +08:00
import re
2014-11-30 07:03:59 +08:00
import time
2013-09-21 20:19:30 +08:00
import traceback
2013-06-24 01:58:33 +08:00
2013-06-24 02:28:15 +08:00
from . common import InfoExtractor , SearchInfoExtractor
2014-03-30 13:02:58 +08:00
from . . jsinterp import JSInterpreter
2014-07-18 16:24:28 +08:00
from . . swfinterp import SWFInterpreter
2014-12-11 17:08:17 +08:00
from . . compat import (
2013-09-22 16:30:02 +08:00
compat_chr ,
2013-06-24 01:58:33 +08:00
compat_parse_qs ,
compat_urllib_parse ,
compat_urllib_request ,
2013-10-01 23:58:13 +08:00
compat_urlparse ,
2013-06-24 01:58:33 +08:00
compat_str ,
2014-12-11 17:08:17 +08:00
)
from . . utils import (
2013-06-24 01:58:33 +08:00
clean_html ,
ExtractorError ,
2015-02-12 01:39:31 +08:00
float_or_none ,
2014-12-11 17:08:17 +08:00
get_element_by_attribute ,
get_element_by_id ,
2014-01-19 12:47:20 +08:00
int_or_none ,
2014-12-11 17:08:17 +08:00
orderedSet ,
2015-06-29 02:48:06 +08:00
str_to_int ,
2013-06-24 01:58:33 +08:00
unescapeHTML ,
unified_strdate ,
2014-02-10 00:56:10 +08:00
uppercase_escape ,
2015-06-27 13:15:57 +08:00
ISO3166Utils ,
2013-06-24 01:58:33 +08:00
)
2014-11-24 03:41:03 +08:00
2013-09-11 21:48:23 +08:00
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Shared plumbing for the YouTube extractors: language cookie, login, result helpers.

    NOTE(review): every string literal in this class is reproduced exactly as
    found, including leading/trailing (and some interior) spaces.  That padding
    looks like an extraction artefact rather than intent, and some padded
    patterns (e.g. an inline ``(?s)`` flag preceded by a space) may not even
    compile on current Python -- confirm against the upstream file before
    relying on the URLs, form keys or regular expressions below.
    """

    # Page that is first fetched (to read the GALX token) and then POSTed to.
    _LOGIN_URL = ' https://accounts.google.com/ServiceLogin '
    # POST target for the second-factor (TOTP) form.
    _TWOFACTOR_URL = ' https://accounts.google.com/SecondFactor '
    # presumably the netrc machine name consulted by _get_login_info() -- TODO confirm
    _NETRC_MACHINE = ' youtube '
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie so that subsequent requests carry ``hl=en``."""
        # YouTube sets the expire time to about two months
        lifetime_seconds = 2 * 30 * 24 * 3600
        self._set_cookie(
            ' .youtube.com ', ' PREF ', ' f1=50000000&hl=en ',
            expire_time=time.time() + lifetime_seconds)

    def _ids_to_results(self, ids):
        """Return one ``url_result`` entry per video id in *ids*, in order."""
        return [
            self.url_result(vid_id, ' Youtube ', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.

        Returns True if the login succeeded or was skipped because no
        credentials are configured, and False if any step of the login failed.

        Raises ExtractorError if _LOGIN_REQUIRED is set and no credentials were
        provided, or if the login response contains the ``errormsg_0_Passwd``
        marker (reported to the user as an application-specific password
        being used).
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(' No login info available, needed for using %s . ' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=' Downloading login page ',
            errnote=' unable to fetch login page ', fatal=False)
        if login_page is False:
            # Was a bare ``return`` (None); the documented contract is False.
            return False

        galx = self._search_regex(r' (?s)<input.+?name= " GALX " .+?value= " (.+?) " ',
                                  login_page, ' Login GALX parameter ')

        # Log in
        login_form_strs = {
            ' continue ': ' https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1 ',
            ' Email ': username,
            ' GALX ': galx,
            ' Passwd ': password,
            ' PersistentCookie ': ' yes ',
            ' _utf8 ': ' 霱 ',
            ' bgresponse ': ' js_disabled ',
            ' checkConnection ': ' ',
            ' checkedDomains ': ' youtube ',
            ' dnConn ': ' ',
            ' pstMsg ': ' 0 ',
            ' rmShown ': ' 1 ',
            ' secTok ': ' ',
            ' signIn ': ' Sign in ',
            ' timeStmp ': ' ',
            ' service ': ' youtube ',
            ' uilel ': ' 3 ',
            ' hl ': ' en_US ',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode(' utf-8 '), v.encode(' utf-8 ')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode(' ascii ')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=' Logging in ', errnote=' unable to log in ', fatal=False)
        if login_results is False:
            return False

        if re.search(r' id= " errormsg_0_Passwd " ', login_results) is not None:
            raise ExtractorError(' Please use your account password and a two-factor code instead of an application-specific password. ', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r' (?i)<form[^>]* id= " gaia_secondfactorform " ', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning(' Two-factor authentication required. Provide it with --twofactor <code> ')
                self._downloader.report_warning(' (Note that only TOTP (Google Authenticator App) codes work at this time.) ')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            sec_tok_match = re.search(r' id= " secTok " \ n \ s+value= \' (.+) \' /> ', login_results, re.M | re.U)
            if sec_tok_match is None:
                self._downloader.report_warning(' Failed to get secTok - did the page structure change? ')
                # Previously execution continued and crashed with
                # AttributeError on ``None.group``; treat it as a failed login.
                return False
            secTok = sec_tok_match.group(1)

            time_stmp_match = re.search(r' id= " timeStmp " \ n \ s+value= \' (.+) \' /> ', login_results, re.M | re.U)
            if time_stmp_match is None:
                self._downloader.report_warning(' Failed to get timeStmp - did the page structure change? ')
                # Same fix as for secTok above.
                return False
            timeStmp = time_stmp_match.group(1)

            tfa_form_strs = {
                ' continue ': ' https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1 ',
                ' smsToken ': ' ',
                ' smsUserPin ': tfa_code,
                ' smsVerifyPin ': ' Verify ',

                ' PersistentCookie ': ' yes ',
                ' checkConnection ': ' ',
                ' checkedDomains ': ' youtube ',
                ' pstMsg ': ' 1 ',
                ' secTok ': secTok,
                ' timeStmp ': timeStmp,
                ' service ': ' youtube ',
                ' hl ': ' en_US ',
            }
            tfa_form = dict((k.encode(' utf-8 '), v.encode(' utf-8 ')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode(' ascii ')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note=' Submitting TFA code ', errnote=' unable to submit tfa ', fatal=False)

            if tfa_results is False:
                return False

            # Being shown the second-factor form again means the code was rejected.
            if re.search(r' (?i)<form[^>]* id= " gaia_secondfactorform " ', tfa_results) is not None:
                self._downloader.report_warning(' Two-factor code expired. Please try again, or use a one-use backup code instead. ')
                return False
            if re.search(r' (?i)<form[^>]* id= " gaia_loginform " ', tfa_results) is not None:
                self._downloader.report_warning(' unable to log in - did the page structure change? ')
                return False
            if re.search(r' smsauth-interstitial-reviewsettings ', tfa_results) is not None:
                self._downloader.report_warning(' Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again. ')
                return False

        # The login form being present in the first response means the
        # credentials were not accepted.
        if re.search(r' (?i)<form[^>]* id= " gaia_loginform " ', login_results) is not None:
            self._downloader.report_warning(' unable to log in: bad username or password ')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt a login; do nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is deliberately non-fatal: the original
        # ``if not self._login(): return`` had nothing after it, so the
        # result is simply ignored.
        self._login()
2013-06-24 01:58:33 +08:00
2013-08-08 14:54:10 +08:00
2015-02-17 04:44:17 +08:00
class YoutubeIE ( YoutubeBaseInfoExtractor ) :
2014-09-13 13:51:06 +08:00
IE_DESC = ' YouTube.com '
2013-11-18 23:42:35 +08:00
_VALID_URL = r """ (?x)^
2013-06-24 01:58:33 +08:00
(
2014-09-12 03:47:25 +08:00
( ? : https ? : / / | / / ) # http(s):// or protocol-independent URL
2013-11-18 23:42:35 +08:00
( ? : ( ? : ( ? : ( ? : \w + \. ) ? [ yY ] [ oO ] [ uU ] [ tT ] [ uU ] [ bB ] [ eE ] ( ? : - nocookie ) ? \. com / |
2014-01-17 09:53:34 +08:00
( ? : www \. ) ? deturl \. com / www \. youtube \. com / |
2014-02-10 08:30:47 +08:00
( ? : www \. ) ? pwnyoutube \. com / |
2014-02-19 03:00:54 +08:00
( ? : www \. ) ? yourepeat \. com / |
2013-09-15 18:14:59 +08:00
tube \. majestyc \. net / |
youtube \. googleapis \. com / ) # the various hostnames, with wildcard subdomains
2013-06-24 01:58:33 +08:00
( ? : . * ? \#/)? # handle anchor (#/) redirect urls
( ? : # the various things that can precede the ID:
2014-09-24 16:34:29 +08:00
( ? : ( ? : v | embed | e ) / ( ? ! videoseries ) ) # v/ or embed/ or e/
2013-06-24 01:58:33 +08:00
| ( ? : # or the v= param in all its forms
2014-02-19 03:00:54 +08:00
( ? : ( ? : watch | movie ) ( ? : _popup ) ? ( ? : \. php ) ? / ? ) ? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
2013-06-24 01:58:33 +08:00
( ? : \? | \#!?) # the params delimiter ? or # or #!
( ? : . * ? & ) ? # any other preceding param (like /?s=tuff&v=xxxx)
v =
)
2013-09-06 04:38:23 +08:00
) )
| youtu \. be / # just youtu.be/xxxx
2014-09-12 03:47:25 +08:00
| ( ? : www \. ) ? cleanvideosearch \. com / media / action / yt / watch \? videoId =
2013-09-06 04:38:23 +08:00
)
2013-06-24 01:58:33 +08:00
) ? # all until now is optional -> you can pass the naked ID
2013-09-09 16:33:12 +08:00
( [ 0 - 9 A - Za - z_ - ] { 11 } ) # here is it! the YouTube video ID
2014-09-13 13:31:48 +08:00
( ? ! . * ? & list = ) # combined list/video URLs are handled by the playlist IE
2013-06-24 01:58:33 +08:00
( ? ( 1 ) . + ) ? # if we found the ID, everything can follow
$ """
_NEXT_URL_RE = r ' [ \ ?&]next_url=([^&]+) '
2013-12-24 19:34:09 +08:00
_formats = {
' 5 ' : { ' ext ' : ' flv ' , ' width ' : 400 , ' height ' : 240 } ,
' 6 ' : { ' ext ' : ' flv ' , ' width ' : 450 , ' height ' : 270 } ,
' 13 ' : { ' ext ' : ' 3gp ' } ,
' 17 ' : { ' ext ' : ' 3gp ' , ' width ' : 176 , ' height ' : 144 } ,
' 18 ' : { ' ext ' : ' mp4 ' , ' width ' : 640 , ' height ' : 360 } ,
' 22 ' : { ' ext ' : ' mp4 ' , ' width ' : 1280 , ' height ' : 720 } ,
' 34 ' : { ' ext ' : ' flv ' , ' width ' : 640 , ' height ' : 360 } ,
' 35 ' : { ' ext ' : ' flv ' , ' width ' : 854 , ' height ' : 480 } ,
' 36 ' : { ' ext ' : ' 3gp ' , ' width ' : 320 , ' height ' : 240 } ,
' 37 ' : { ' ext ' : ' mp4 ' , ' width ' : 1920 , ' height ' : 1080 } ,
' 38 ' : { ' ext ' : ' mp4 ' , ' width ' : 4096 , ' height ' : 3072 } ,
' 43 ' : { ' ext ' : ' webm ' , ' width ' : 640 , ' height ' : 360 } ,
' 44 ' : { ' ext ' : ' webm ' , ' width ' : 854 , ' height ' : 480 } ,
' 45 ' : { ' ext ' : ' webm ' , ' width ' : 1280 , ' height ' : 720 } ,
' 46 ' : { ' ext ' : ' webm ' , ' width ' : 1920 , ' height ' : 1080 } ,
2015-06-19 22:52:44 +08:00
' 59 ' : { ' ext ' : ' mp4 ' , ' width ' : 854 , ' height ' : 480 } ,
' 78 ' : { ' ext ' : ' mp4 ' , ' width ' : 854 , ' height ' : 480 } ,
2013-12-24 19:34:09 +08:00
2013-07-20 18:46:02 +08:00
2013-08-02 01:47:48 +08:00
# 3d videos
2014-03-22 21:22:39 +08:00
' 82 ' : { ' ext ' : ' mp4 ' , ' height ' : 360 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
' 83 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
' 84 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
' 85 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
' 100 ' : { ' ext ' : ' webm ' , ' height ' : 360 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
' 101 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
' 102 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' format_note ' : ' 3D ' , ' preference ' : - 20 } ,
2013-08-20 09:22:25 +08:00
2013-09-04 09:49:35 +08:00
# Apple HTTP Live Streaming
2014-03-22 21:22:39 +08:00
' 92 ' : { ' ext ' : ' mp4 ' , ' height ' : 240 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
' 93 ' : { ' ext ' : ' mp4 ' , ' height ' : 360 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
' 94 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
' 95 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
' 96 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
' 132 ' : { ' ext ' : ' mp4 ' , ' height ' : 240 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
' 151 ' : { ' ext ' : ' mp4 ' , ' height ' : 72 , ' format_note ' : ' HLS ' , ' preference ' : - 10 } ,
2013-12-24 19:34:09 +08:00
# DASH mp4 video
2014-03-22 21:22:39 +08:00
' 133 ' : { ' ext ' : ' mp4 ' , ' height ' : 240 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 134 ' : { ' ext ' : ' mp4 ' , ' height ' : 360 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 135 ' : { ' ext ' : ' mp4 ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 136 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 137 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
2015-01-04 01:33:38 +08:00
' 138 ' : { ' ext ' : ' mp4 ' , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } , # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
2014-03-22 21:22:39 +08:00
' 160 ' : { ' ext ' : ' mp4 ' , ' height ' : 144 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 264 ' : { ' ext ' : ' mp4 ' , ' height ' : 1440 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
2014-10-31 18:13:02 +08:00
' 298 ' : { ' ext ' : ' mp4 ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' fps ' : 60 , ' vcodec ' : ' h264 ' } ,
' 299 ' : { ' ext ' : ' mp4 ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' fps ' : 60 , ' vcodec ' : ' h264 ' } ,
' 266 ' : { ' ext ' : ' mp4 ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' vcodec ' : ' h264 ' } ,
2013-08-20 09:22:25 +08:00
2013-10-19 05:53:00 +08:00
# Dash mp4 audio
2015-01-24 01:39:12 +08:00
' 139 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' vcodec ' : ' none ' , ' abr ' : 48 , ' preference ' : - 50 , ' container ' : ' m4a_dash ' } ,
' 140 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' vcodec ' : ' none ' , ' abr ' : 128 , ' preference ' : - 50 , ' container ' : ' m4a_dash ' } ,
' 141 ' : { ' ext ' : ' m4a ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' aac ' , ' vcodec ' : ' none ' , ' abr ' : 256 , ' preference ' : - 50 , ' container ' : ' m4a_dash ' } ,
2013-08-20 09:22:25 +08:00
# Dash webm
2014-04-29 05:18:59 +08:00
' 167 ' : { ' ext ' : ' webm ' , ' height ' : 360 , ' width ' : 640 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' container ' : ' webm ' , ' vcodec ' : ' VP8 ' , ' preference ' : - 40 } ,
' 168 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' width ' : 854 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' container ' : ' webm ' , ' vcodec ' : ' VP8 ' , ' preference ' : - 40 } ,
' 169 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' width ' : 1280 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' container ' : ' webm ' , ' vcodec ' : ' VP8 ' , ' preference ' : - 40 } ,
' 170 ' : { ' ext ' : ' webm ' , ' height ' : 1080 , ' width ' : 1920 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' container ' : ' webm ' , ' vcodec ' : ' VP8 ' , ' preference ' : - 40 } ,
' 218 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' width ' : 854 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' container ' : ' webm ' , ' vcodec ' : ' VP8 ' , ' preference ' : - 40 } ,
' 219 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' width ' : 854 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' container ' : ' webm ' , ' vcodec ' : ' VP8 ' , ' preference ' : - 40 } ,
2014-10-13 06:09:19 +08:00
' 278 ' : { ' ext ' : ' webm ' , ' height ' : 144 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' container ' : ' webm ' , ' vcodec ' : ' VP9 ' } ,
2014-04-29 05:18:59 +08:00
' 242 ' : { ' ext ' : ' webm ' , ' height ' : 240 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 243 ' : { ' ext ' : ' webm ' , ' height ' : 360 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 244 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 245 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 246 ' : { ' ext ' : ' webm ' , ' height ' : 480 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 247 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
' 248 ' : { ' ext ' : ' webm ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
2014-06-07 20:31:10 +08:00
' 271 ' : { ' ext ' : ' webm ' , ' height ' : 1440 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
2014-06-21 18:03:27 +08:00
' 272 ' : { ' ext ' : ' webm ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 } ,
2014-10-30 16:43:11 +08:00
' 302 ' : { ' ext ' : ' webm ' , ' height ' : 720 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' fps ' : 60 , ' vcodec ' : ' VP9 ' } ,
' 303 ' : { ' ext ' : ' webm ' , ' height ' : 1080 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' fps ' : 60 , ' vcodec ' : ' VP9 ' } ,
2015-01-06 18:59:41 +08:00
' 308 ' : { ' ext ' : ' webm ' , ' height ' : 1440 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' fps ' : 60 , ' vcodec ' : ' VP9 ' } ,
2014-12-01 01:56:14 +08:00
' 313 ' : { ' ext ' : ' webm ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' vcodec ' : ' VP9 ' } ,
2015-01-06 18:59:41 +08:00
' 315 ' : { ' ext ' : ' webm ' , ' height ' : 2160 , ' format_note ' : ' DASH video ' , ' acodec ' : ' none ' , ' preference ' : - 40 , ' fps ' : 60 , ' vcodec ' : ' VP9 ' } ,
2013-12-24 19:34:09 +08:00
# Dash webm audio
2014-08-21 19:13:26 +08:00
' 171 ' : { ' ext ' : ' webm ' , ' vcodec ' : ' none ' , ' format_note ' : ' DASH audio ' , ' abr ' : 128 , ' preference ' : - 50 } ,
2014-04-29 05:18:59 +08:00
' 172 ' : { ' ext ' : ' webm ' , ' vcodec ' : ' none ' , ' format_note ' : ' DASH audio ' , ' abr ' : 256 , ' preference ' : - 50 } ,
2014-01-09 09:38:50 +08:00
2014-11-18 18:06:09 +08:00
# Dash webm audio with opus inside
' 249 ' : { ' ext ' : ' webm ' , ' vcodec ' : ' none ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' opus ' , ' abr ' : 50 , ' preference ' : - 50 } ,
' 250 ' : { ' ext ' : ' webm ' , ' vcodec ' : ' none ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' opus ' , ' abr ' : 70 , ' preference ' : - 50 } ,
' 251 ' : { ' ext ' : ' webm ' , ' vcodec ' : ' none ' , ' format_note ' : ' DASH audio ' , ' acodec ' : ' opus ' , ' abr ' : 160 , ' preference ' : - 50 } ,
2014-01-09 09:38:50 +08:00
# RTMP (unnamed)
' _rtmp ' : { ' protocol ' : ' rtmp ' } ,
2013-06-24 01:58:33 +08:00
}
2013-08-20 09:22:25 +08:00
2014-09-13 13:51:06 +08:00
IE_NAME = ' youtube '
2013-06-28 01:13:11 +08:00
_TESTS = [
{
2014-09-24 15:49:53 +08:00
' url ' : ' http://www.youtube.com/watch?v=BaW_jenozKc ' ,
' info_dict ' : {
' id ' : ' BaW_jenozKc ' ,
' ext ' : ' mp4 ' ,
' title ' : ' youtube-dl test video " \' / \\ ä↭𝕐 ' ,
' uploader ' : ' Philipp Hagemeister ' ,
' uploader_id ' : ' phihag ' ,
' upload_date ' : ' 20121002 ' ,
' description ' : ' test chars: " \' / \\ ä↭𝕐 \n test URL: https://github.com/rg3/youtube-dl/issues/1892 \n \n This is a test video for youtube-dl. \n \n For more information, contact phihag@phihag.de . ' ,
' categories ' : [ ' Science & Technology ' ] ,
2014-09-01 00:10:05 +08:00
' like_count ' : int ,
' dislike_count ' : int ,
2013-06-28 01:13:11 +08:00
}
2013-06-28 01:55:39 +08:00
} ,
{
2014-09-24 15:49:53 +08:00
' url ' : ' http://www.youtube.com/watch?v=UxxajLWwzqY ' ,
' note ' : ' Test generic use_cipher_signature video (#897) ' ,
' info_dict ' : {
' id ' : ' UxxajLWwzqY ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20120506 ' ,
' title ' : ' Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO] ' ,
' description ' : ' md5:fea86fda2d5a5784273df5c7cc994d9f ' ,
' uploader ' : ' Icona Pop ' ,
' uploader_id ' : ' IconaPop ' ,
2013-06-28 01:13:11 +08:00
}
2013-07-09 20:38:24 +08:00
} ,
{
2014-09-24 15:49:53 +08:00
' url ' : ' https://www.youtube.com/watch?v=07FYdnEawAQ ' ,
' note ' : ' Test VEVO video with age protection (#956) ' ,
' info_dict ' : {
' id ' : ' 07FYdnEawAQ ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20130703 ' ,
' title ' : ' Justin Timberlake - Tunnel Vision (Explicit) ' ,
' description ' : ' md5:64249768eec3bc4276236606ea996373 ' ,
' uploader ' : ' justintimberlakeVEVO ' ,
' uploader_id ' : ' justintimberlakeVEVO ' ,
2013-07-09 20:38:24 +08:00
}
} ,
2013-11-18 20:05:18 +08:00
{
2014-09-24 15:49:53 +08:00
' url ' : ' //www.YouTube.com/watch?v=yZIXLfi8CZQ ' ,
' note ' : ' Embed-only video (#1746) ' ,
' info_dict ' : {
' id ' : ' yZIXLfi8CZQ ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20120608 ' ,
' title ' : ' Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012 ' ,
' description ' : ' md5:09b78bd971f1e3e289601dfba15ca4f7 ' ,
' uploader ' : ' SET India ' ,
' uploader_id ' : ' setindia '
2013-11-18 20:05:18 +08:00
}
} ,
2014-01-19 12:47:20 +08:00
{
2014-09-24 15:49:53 +08:00
' url ' : ' http://www.youtube.com/watch?v=a9LDPn-MO4I ' ,
' note ' : ' 256k DASH audio (format 141) via DASH manifest ' ,
' info_dict ' : {
' id ' : ' a9LDPn-MO4I ' ,
' ext ' : ' m4a ' ,
' upload_date ' : ' 20121002 ' ,
' uploader_id ' : ' 8KVIDEO ' ,
' description ' : ' ' ,
' uploader ' : ' 8KVIDEO ' ,
' title ' : ' UHDTV TEST 8K VIDEO.mp4 '
2014-01-23 04:56:37 +08:00
} ,
2014-09-24 15:49:53 +08:00
' params ' : {
' youtube_include_dash_manifest ' : True ,
' format ' : ' 141 ' ,
2014-01-23 04:56:37 +08:00
} ,
2014-01-19 12:47:20 +08:00
} ,
2014-02-21 22:15:58 +08:00
# DASH manifest with encrypted signature
{
2014-09-13 13:51:06 +08:00
' url ' : ' https://www.youtube.com/watch?v=IB3lcPjvWLA ' ,
' info_dict ' : {
' id ' : ' IB3lcPjvWLA ' ,
' ext ' : ' m4a ' ,
2014-12-01 02:18:39 +08:00
' title ' : ' Afrojack, Spree Wilson - The Spark ft. Spree Wilson ' ,
' description ' : ' md5:12e7067fa6735a77bdcbb58cb1187d2d ' ,
2014-09-13 13:51:06 +08:00
' uploader ' : ' AfrojackVEVO ' ,
' uploader_id ' : ' AfrojackVEVO ' ,
' upload_date ' : ' 20131011 ' ,
2014-02-21 22:15:58 +08:00
} ,
2014-09-24 15:49:53 +08:00
' params ' : {
2014-09-13 13:51:06 +08:00
' youtube_include_dash_manifest ' : True ,
' format ' : ' 141 ' ,
2014-02-21 22:15:58 +08:00
} ,
} ,
2015-01-16 03:25:03 +08:00
# JS player signature function name containing $
{
' url ' : ' https://www.youtube.com/watch?v=nfWlot6h_JM ' ,
' info_dict ' : {
' id ' : ' nfWlot6h_JM ' ,
' ext ' : ' m4a ' ,
' title ' : ' Taylor Swift - Shake It Off ' ,
' description ' : ' md5:2acfda1b285bdd478ccec22f9918199d ' ,
' uploader ' : ' TaylorSwiftVEVO ' ,
' uploader_id ' : ' TaylorSwiftVEVO ' ,
' upload_date ' : ' 20140818 ' ,
} ,
' params ' : {
' youtube_include_dash_manifest ' : True ,
' format ' : ' 141 ' ,
} ,
} ,
2014-11-23 16:59:02 +08:00
# Controversy video
{
' url ' : ' https://www.youtube.com/watch?v=T4XJQO3qol8 ' ,
' info_dict ' : {
' id ' : ' T4XJQO3qol8 ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20100909 ' ,
' uploader ' : ' The Amazing Atheist ' ,
' uploader_id ' : ' TheAmazingAtheist ' ,
' title ' : ' Burning Everyone \' s Koran ' ,
' description ' : ' SUBSCRIBE: http://www.youtube.com/saturninefilms \n \n Even Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html ' ,
}
2014-12-01 04:45:49 +08:00
} ,
# Normal age-gate video (No vevo, embed allowed)
{
' url ' : ' http://youtube.com/watch?v=HtVdAasjOgU ' ,
' info_dict ' : {
' id ' : ' HtVdAasjOgU ' ,
' ext ' : ' mp4 ' ,
' title ' : ' The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer ' ,
2015-01-08 21:27:01 +08:00
' description ' : ' re:(?s). { 100,}About the Game \n .*?The Witcher 3: Wild Hunt. { 100,} ' ,
2014-12-01 04:45:49 +08:00
' uploader ' : ' The Witcher ' ,
' uploader_id ' : ' WitcherGame ' ,
' upload_date ' : ' 20140605 ' ,
} ,
} ,
2014-12-30 19:26:21 +08:00
# Age-gate video with encrypted signature
{
' url ' : ' http://www.youtube.com/watch?v=6kLq3WMV1nU ' ,
' info_dict ' : {
' id ' : ' 6kLq3WMV1nU ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Dedication To My Ex (Miss That) (Lyric Video) ' ,
' description ' : ' md5:33765bb339e1b47e7e72b5490139bb41 ' ,
' uploader ' : ' LloydVEVO ' ,
' uploader_id ' : ' LloydVEVO ' ,
' upload_date ' : ' 20110629 ' ,
} ,
} ,
2014-12-10 20:21:24 +08:00
# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
{
' url ' : ' __2ABJjxzNo ' ,
' info_dict ' : {
' id ' : ' __2ABJjxzNo ' ,
' ext ' : ' mp4 ' ,
' upload_date ' : ' 20100430 ' ,
' uploader_id ' : ' deadmau5 ' ,
' description ' : ' md5:12c56784b8032162bb936a5f76d55360 ' ,
' uploader ' : ' deadmau5 ' ,
' title ' : ' Deadmau5 - Some Chords (HD) ' ,
} ,
' expected_warnings ' : [
' DASH manifest missing ' ,
]
2014-12-11 23:28:07 +08:00
} ,
# Olympics (https://github.com/rg3/youtube-dl/issues/4431)
{
' url ' : ' lqQg6PlCWgI ' ,
' info_dict ' : {
' id ' : ' lqQg6PlCWgI ' ,
' ext ' : ' mp4 ' ,
2014-12-11 23:34:37 +08:00
' upload_date ' : ' 20120731 ' ,
' uploader_id ' : ' olympic ' ,
' description ' : ' HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games ' ,
' uploader ' : ' Olympics ' ,
' title ' : ' Hockey - Women - GER-AUS - London 2012 Olympic Games ' ,
} ,
' params ' : {
' skip_download ' : ' requires avconv ' ,
2014-12-11 23:28:07 +08:00
}
2014-12-11 23:34:37 +08:00
} ,
2015-01-10 12:45:51 +08:00
# Non-square pixels
{
' url ' : ' https://www.youtube.com/watch?v=_b-2C3KPAM0 ' ,
' info_dict ' : {
' id ' : ' _b-2C3KPAM0 ' ,
' ext ' : ' mp4 ' ,
' stretched_ratio ' : 16 / 9. ,
' upload_date ' : ' 20110310 ' ,
' uploader_id ' : ' AllenMeow ' ,
' description ' : ' made by Wacom from Korea | 字幕&加油添醋 by TY \' s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯 ' ,
' uploader ' : ' 孫艾倫 ' ,
' title ' : ' [A-made] 變態妍字幕版 太妍 我就是這樣的人 ' ,
} ,
2015-04-06 02:35:55 +08:00
} ,
# url_encoded_fmt_stream_map is empty string
{
' url ' : ' qEJwOuvDf7I ' ,
' info_dict ' : {
' id ' : ' qEJwOuvDf7I ' ,
' ext ' : ' mp4 ' ,
' title ' : ' Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге ' ,
' description ' : ' ' ,
' upload_date ' : ' 20150404 ' ,
' uploader_id ' : ' spbelect ' ,
' uploader ' : ' Наблюдатели Петербурга ' ,
} ,
' params ' : {
' skip_download ' : ' requires avconv ' ,
}
} ,
2015-06-27 16:55:46 +08:00
# Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
{
' url ' : ' https://www.youtube.com/watch?v=FIl7x6_3R5Y ' ,
' info_dict ' : {
' id ' : ' FIl7x6_3R5Y ' ,
' ext ' : ' mp4 ' ,
' title ' : ' md5:7b81415841e02ecd4313668cde88737a ' ,
' description ' : ' md5:116377fd2963b81ec4ce64b542173306 ' ,
' upload_date ' : ' 20150625 ' ,
' uploader_id ' : ' dorappi2000 ' ,
' uploader ' : ' dorappi2000 ' ,
' formats ' : ' mincount:33 ' ,
} ,
}
2013-06-28 01:13:11 +08:00
]
2013-09-21 20:19:30 +08:00
def __init__(self, *args, **kwargs):
    """Forward all arguments to the parent constructor, then create an empty player cache.

    ``_player_cache`` starts out as an empty dict; its consumers are outside
    this view (presumably it memoises per-player data -- TODO confirm).
    """
    parent = super(YoutubeIE, self)
    parent.__init__(*args, **kwargs)
    self._player_cache = dict()
2013-09-21 20:19:30 +08:00
2013-06-24 01:58:33 +08:00
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage for *video_id* is being downloaded."""
    message = ' %s : Downloading video info webpage ' % video_id
    self.to_screen(message)
2013-06-24 01:58:33 +08:00
def report_information_extraction(self, video_id):
    """Print a notice that video information for *video_id* is being extracted."""
    message = ' %s : Extracting video information ' % video_id
    self.to_screen(message)
2013-06-24 01:58:33 +08:00
def report_unavailable_format(self, video_id, format):
    """Print a notice that *format* is not available for *video_id*."""
    details = (video_id, format)
    self.to_screen(' %s : Format %s not available ' % details)
2013-06-24 01:58:33 +08:00
def report_rtmp_download(self):
    """Print a notice that an RTMP download was detected."""
    message = ' RTMP download detected '
    self.to_screen(message)
2013-06-24 01:58:33 +08:00
2014-08-02 18:21:53 +08:00
def _signature_cache_id(self, example_sig):
    """Describe the shape of a signature as the lengths of its parts.

    *example_sig* is split on the separator literal, each part is replaced
    by its length (as a string), and the lengths are joined back with the
    same separator.
    """
    # NOTE(review): the separator is kept exactly as found (' . ', with
    # surrounding spaces); the padding looks like an extraction artefact --
    # confirm against the upstream file.
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split(' . ')]
    return ' . '.join(part_lengths)
2014-08-02 18:21:53 +08:00
def _extract_signature_function ( self , video_id , player_url , example_sig ) :
2014-07-17 22:28:30 +08:00
id_m = re . match (
2014-12-14 07:43:34 +08:00
r ' .*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)? \ .(?P<ext>[a-z]+)$ ' ,
2014-07-17 22:28:30 +08:00
player_url )
2014-07-23 08:19:33 +08:00
if not id_m :
raise ExtractorError ( ' Cannot identify player %r ' % player_url )
2013-09-21 20:19:30 +08:00
player_type = id_m . group ( ' ext ' )
player_id = id_m . group ( ' id ' )
2013-09-22 06:35:03 +08:00
# Read from filesystem cache
2014-08-02 18:21:53 +08:00
func_id = ' %s _ %s _ %s ' % (
player_type , player_id , self . _signature_cache_id ( example_sig ) )
2013-09-22 06:35:03 +08:00
assert os . path . basename ( func_id ) == func_id
2014-09-03 18:41:05 +08:00
2014-09-24 15:51:45 +08:00
cache_spec = self . _downloader . cache . load ( ' youtube-sigfuncs ' , func_id )
2014-09-03 18:41:05 +08:00
if cache_spec is not None :
2014-09-13 13:51:06 +08:00
return lambda s : ' ' . join ( s [ i ] for i in cache_spec )
2013-09-21 21:19:48 +08:00
2015-02-18 17:39:14 +08:00
download_note = (
' Downloading player %s ' % player_url
if self . _downloader . params . get ( ' verbose ' ) else
' Downloading %s player %s ' % ( player_type , player_id )
)
2013-09-21 20:19:30 +08:00
if player_type == ' js ' :
code = self . _download_webpage (
player_url , video_id ,
2015-02-18 17:39:14 +08:00
note = download_note ,
2014-09-24 15:51:45 +08:00
errnote = ' Download of %s failed ' % player_url )
2013-09-21 21:19:48 +08:00
res = self . _parse_sig_js ( code )
2013-09-22 06:35:03 +08:00
elif player_type == ' swf ' :
2013-09-21 20:19:30 +08:00
urlh = self . _request_webpage (
player_url , video_id ,
2015-02-18 17:39:14 +08:00
note = download_note ,
2014-09-24 15:51:45 +08:00
errnote = ' Download of %s failed ' % player_url )
2013-09-21 20:19:30 +08:00
code = urlh . read ( )
2013-09-21 21:19:48 +08:00
res = self . _parse_sig_swf ( code )
2013-09-21 20:19:30 +08:00
else :
assert False , ' Invalid player type %r ' % player_type
2015-02-18 17:42:23 +08:00
test_string = ' ' . join ( map ( compat_chr , range ( len ( example_sig ) ) ) )
cache_res = res ( test_string )
cache_spec = [ ord ( c ) for c in cache_res ]
2013-09-21 21:19:48 +08:00
2014-09-24 15:51:45 +08:00
self . _downloader . cache . store ( ' youtube-sigfuncs ' , func_id , cache_spec )
2013-09-21 21:19:48 +08:00
return res
2014-08-02 18:21:53 +08:00
def _print_sig_code ( self , func , example_sig ) :
2013-09-22 16:30:02 +08:00
def gen_sig_code ( idxs ) :
def _genslice ( start , end , step ) :
2014-09-13 13:51:06 +08:00
starts = ' ' if start == 0 else str ( start )
2014-11-24 04:20:46 +08:00
ends = ( ' : %d ' % ( end + step ) ) if end + step > = 0 else ' : '
2014-09-24 15:51:45 +08:00
steps = ' ' if step == 1 else ( ' : %d ' % step )
2014-09-13 13:51:06 +08:00
return ' s[ %s %s %s ] ' % ( starts , ends , steps )
2013-09-22 16:30:02 +08:00
step = None
2014-12-17 07:06:41 +08:00
# Quelch pyflakes warnings - start will be set when step is set
start = ' (Never used) '
2013-09-22 16:30:02 +08:00
for i , prev in zip ( idxs [ 1 : ] , idxs [ : - 1 ] ) :
if step is not None :
if i - prev == step :
continue
yield _genslice ( start , prev , step )
step = None
continue
if i - prev in [ - 1 , 1 ] :
step = i - prev
start = prev
continue
else :
2014-09-13 13:51:06 +08:00
yield ' s[ %d ] ' % prev
2013-09-22 16:30:02 +08:00
if step is None :
2014-09-13 13:51:06 +08:00
yield ' s[ %d ] ' % i
2013-09-22 16:30:02 +08:00
else :
yield _genslice ( start , i , step )
2014-09-13 13:51:06 +08:00
test_string = ' ' . join ( map ( compat_chr , range ( len ( example_sig ) ) ) )
2013-09-22 18:18:16 +08:00
cache_res = func ( test_string )
2013-09-22 16:30:02 +08:00
cache_spec = [ ord ( c ) for c in cache_res ]
2014-09-13 13:51:06 +08:00
expr_code = ' + ' . join ( gen_sig_code ( cache_spec ) )
2014-08-02 18:21:53 +08:00
signature_id_tuple = ' ( %s ) ' % (
' , ' . join ( compat_str ( len ( p ) ) for p in example_sig . split ( ' . ' ) ) )
2014-09-24 15:51:45 +08:00
code = ( ' if tuple(len(p) for p in s.split( \' . \' )) == %s : \n '
2014-09-13 13:51:06 +08:00
' return %s \n ' ) % ( signature_id_tuple , expr_code )
2014-09-24 15:51:45 +08:00
self . to_screen ( ' Extracted signature function: \n ' + code )
2013-09-22 16:30:02 +08:00
2013-09-21 20:19:30 +08:00
def _parse_sig_js(self, jscode):
    """Build a signature deciphering callable from JS player source code."""
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return js_function([s])

    return decipher
def _parse_sig_swf(self, file_contents):
    """Build a signature deciphering callable from SWF player contents."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return swf_function([s])

    return decipher
2013-09-21 21:19:48 +08:00
def _decrypt_signature ( self , s , video_id , player_url , age_gate = False ) :
2013-06-28 04:20:50 +08:00
""" Turn the encrypted s field into a working signature """
2013-06-27 07:51:10 +08:00
2014-07-11 16:44:39 +08:00
if player_url is None :
2014-09-24 15:51:45 +08:00
raise ExtractorError ( ' Cannot decrypt signature without player_url ' )
2013-09-27 12:15:21 +08:00
2014-09-24 15:51:45 +08:00
if player_url . startswith ( ' // ' ) :
2014-09-13 13:51:06 +08:00
player_url = ' https: ' + player_url
2014-07-11 16:44:39 +08:00
try :
2014-08-02 18:23:18 +08:00
player_id = ( player_url , self . _signature_cache_id ( s ) )
2014-07-11 16:44:39 +08:00
if player_id not in self . _player_cache :
func = self . _extract_signature_function (
2014-08-02 18:21:53 +08:00
video_id , player_url , s
2014-07-11 16:44:39 +08:00
)
self . _player_cache [ player_id ] = func
func = self . _player_cache [ player_id ]
if self . _downloader . params . get ( ' youtube_print_sig_code ' ) :
2014-08-02 18:21:53 +08:00
self . _print_sig_code ( func , s )
2014-07-11 16:44:39 +08:00
return func ( s )
except Exception as e :
tb = traceback . format_exc ( )
raise ExtractorError (
2014-09-13 13:51:06 +08:00
' Signature extraction failed: ' + tb , cause = e )
2013-09-21 20:19:30 +08:00
2015-02-17 04:44:17 +08:00
def _get_subtitles(self, video_id, webpage):
    """Return the available subtitle tracks of video_id.

    The result maps each language code to a list of
    {'url': ..., 'ext': ...} dicts, one per supported format. An empty
    dict is returned (after a warning) when the track list cannot be
    downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        # Only the first track listed for a language is kept.
        if lang in sub_lang_list:
            continue
        track_name = track.attrib['name'].encode('utf-8')
        sub_formats = []
        for ext in ['sbv', 'vtt', 'srt']:
            query = compat_urllib_parse.urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track_name,
            })
            sub_formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + query,
                'ext': ext,
            })
        sub_lang_list[lang] = sub_formats

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}
2015-02-17 04:44:17 +08:00
def _get_automatic_captions ( self , video_id , webpage ) :
2013-09-11 21:48:23 +08:00
""" We need the webpage for getting the captions url, pass it as an
argument to speed up the process . """
2014-09-24 15:51:45 +08:00
self . to_screen ( ' %s : Looking for automatic captions ' % video_id )
2013-09-11 21:48:23 +08:00
mobj = re . search ( r ' ;ytplayer.config = ( { .*?}); ' , webpage )
2014-09-13 13:51:06 +08:00
err_msg = ' Couldn \' t find automatic captions for %s ' % video_id
2013-09-11 21:48:23 +08:00
if mobj is None :
self . _downloader . report_warning ( err_msg )
return { }
player_config = json . loads ( mobj . group ( 1 ) )
try :
2014-11-26 19:41:53 +08:00
args = player_config [ ' args ' ]
caption_url = args [ ' ttsurl ' ]
timestamp = args [ ' timestamp ' ]
2013-09-12 01:02:01 +08:00
# We get the available subtitles
list_params = compat_urllib_parse . urlencode ( {
' type ' : ' list ' ,
' tlangs ' : 1 ,
' asrs ' : 1 ,
2013-09-11 21:48:23 +08:00
} )
2013-09-12 01:02:01 +08:00
list_url = caption_url + ' & ' + list_params
2013-11-27 01:48:52 +08:00
caption_list = self . _download_xml ( list_url , video_id )
2013-09-12 01:24:56 +08:00
original_lang_node = caption_list . find ( ' track ' )
2014-12-31 21:15:16 +08:00
if original_lang_node is None :
2014-09-24 15:51:45 +08:00
self . _downloader . report_warning ( ' Video doesn \' t have automatic captions ' )
2013-09-12 01:24:56 +08:00
return { }
original_lang = original_lang_node . attrib [ ' lang_code ' ]
2014-12-31 21:15:16 +08:00
caption_kind = original_lang_node . attrib . get ( ' kind ' , ' ' )
2013-09-12 01:02:01 +08:00
sub_lang_list = { }
for lang_node in caption_list . findall ( ' target ' ) :
sub_lang = lang_node . attrib [ ' lang_code ' ]
2015-02-17 04:44:17 +08:00
sub_formats = [ ]
for ext in [ ' sbv ' , ' vtt ' , ' srt ' ] :
params = compat_urllib_parse . urlencode ( {
' lang ' : original_lang ,
' tlang ' : sub_lang ,
' fmt ' : ext ,
' ts ' : timestamp ,
' kind ' : caption_kind ,
} )
sub_formats . append ( {
' url ' : caption_url + ' & ' + params ,
' ext ' : ext ,
} )
sub_lang_list [ sub_lang ] = sub_formats
2013-09-12 01:02:01 +08:00
return sub_lang_list
2013-09-11 21:48:23 +08:00
# An extractor error can be raise by the download process if there are
# no automatic captions but there are subtitles
except ( KeyError , ExtractorError ) :
self . _downloader . report_warning ( err_msg )
return { }
2014-02-09 02:20:11 +08:00
@classmethod
def extract_id(cls, url):
    """Return the video id found in url.

    Raises ExtractorError if url does not match the extractor's
    _VALID_URL pattern.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        # The id is read from the second capture group of _VALID_URL.
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)
2013-07-20 18:46:02 +08:00
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag -> format URL."""
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

    url_map = {}
    for line in manifest.split('\n'):
        # Empty lines and lines starting with '#' are not format URLs.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
2013-10-14 13:18:58 +08:00
def _extract_annotations(self, video_id):
    """Download and return the annotations document of video_id."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')
2013-10-14 13:18:58 +08:00
2014-12-10 21:39:00 +08:00
def _parse_dash_manifest (
self , video_id , dash_manifest_url , player_url , age_gate ) :
2014-12-10 20:21:24 +08:00
def decrypt_sig ( mobj ) :
s = mobj . group ( 1 )
dec_s = self . _decrypt_signature ( s , video_id , player_url , age_gate )
return ' /signature/ %s ' % dec_s
2015-06-09 14:48:18 +08:00
dash_manifest_url = re . sub ( r ' /s/([a-fA-F0-9 \ .]+) ' , decrypt_sig , dash_manifest_url )
2014-12-10 20:21:24 +08:00
dash_doc = self . _download_xml (
dash_manifest_url , video_id ,
note = ' Downloading DASH manifest ' ,
errnote = ' Could not download DASH manifest ' )
formats = [ ]
2015-04-08 03:45:02 +08:00
for a in dash_doc . findall ( ' .// { urn:mpeg:DASH:schema:MPD:2011}AdaptationSet ' ) :
mime_type = a . attrib . get ( ' mimeType ' )
for r in a . findall ( ' { urn:mpeg:DASH:schema:MPD:2011}Representation ' ) :
url_el = r . find ( ' { urn:mpeg:DASH:schema:MPD:2011}BaseURL ' )
if url_el is None :
continue
if mime_type == ' text/vtt ' :
# TODO implement WebVTT downloading
pass
elif mime_type . startswith ( ' audio/ ' ) or mime_type . startswith ( ' video/ ' ) :
format_id = r . attrib [ ' id ' ]
video_url = url_el . text
filesize = int_or_none ( url_el . attrib . get ( ' { http://youtube.com/yt/2012/10/10}contentLength ' ) )
f = {
' format_id ' : format_id ,
' url ' : video_url ,
' width ' : int_or_none ( r . attrib . get ( ' width ' ) ) ,
' height ' : int_or_none ( r . attrib . get ( ' height ' ) ) ,
' tbr ' : int_or_none ( r . attrib . get ( ' bandwidth ' ) , 1000 ) ,
' asr ' : int_or_none ( r . attrib . get ( ' audioSamplingRate ' ) ) ,
' filesize ' : filesize ,
' fps ' : int_or_none ( r . attrib . get ( ' frameRate ' ) ) ,
}
try :
existing_format = next (
fo for fo in formats
if fo [ ' format_id ' ] == format_id )
except StopIteration :
full_info = self . _formats . get ( format_id , { } ) . copy ( )
full_info . update ( f )
2015-06-27 02:41:26 +08:00
codecs = r . attrib . get ( ' codecs ' )
if codecs :
if full_info . get ( ' acodec ' ) == ' none ' and ' vcodec ' not in full_info :
full_info [ ' vcodec ' ] = codecs
elif full_info . get ( ' vcodec ' ) == ' none ' and ' acodec ' not in full_info :
full_info [ ' acodec ' ] = codecs
2015-04-08 03:45:02 +08:00
formats . append ( full_info )
else :
existing_format . update ( f )
else :
self . report_warning ( ' Unknown MIME type %s in DASH manifest ' % mime_type )
2014-12-10 20:21:24 +08:00
return formats
2013-06-24 01:58:33 +08:00
def _real_extract(self, url):
    """Extract metadata and the list of formats for the video at url.

    Downloads the watch page (and, if needed, the embed page and
    get_video_info), collects uploader/title/thumbnail/date/description
    and counters from them, builds the format list from RTMP, the encoded
    stream maps or an HLS manifest, merges in DASH manifest formats when
    enabled, and returns the info dict.

    Raises ExtractorError when YouTube reports the video as unavailable,
    when mandatory fields are missing, or when no format source is found.
    """
    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # DASH manifest URLs collected from every video info source, in
    # discovery order and without duplicates.
    dash_mpds = []

    def add_dash_mpd(video_info):
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if mobj:
            json_code = uppercase_escape(mobj.group(1))
            ytplayer_config = json.loads(json_code)
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                add_dash_mpd(get_video_info)
                # Keep the webpage-derived info if there is one; otherwise
                # fall back to the first get_video_info response.
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed is not None:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        # Normalise separators to single spaces before parsing the date.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the target given in their title attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # For age-gated videos the lookup above is non-fatal and
                # may have found nothing; json.loads(None) would raise
                # TypeError, so keep the previously known player URL and
                # let the fallback below handle a missing one.
                if jsplayer_url_json:
                    player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-([^/]+?)(?:/html5player)?\.js',
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[format_id] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        for dash_manifest_url in dash_mpds:
            dash_formats = {}
            try:
                for df in self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate):
                    # Keep the first entry seen for a format id within
                    # this manifest.
                    # NOTE(review): dash_formats is reset for every
                    # manifest, so entries from a later manifest replace
                    # same-id entries from an earlier one below.
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        stretch_w = float(stretched_m.group('w'))
        stretch_h = float(stretched_m.group('h'))
        # The tag may hold an invalid ratio such as "17:0"; only apply
        # ratios with both sides positive to avoid a division by zero.
        if stretch_w > 0 and stretch_h > 0:
            ratio = stretch_w / stretch_h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
    }
2013-06-24 01:58:33 +08:00
2014-11-24 03:41:03 +08:00
2013-11-13 23:39:11 +08:00
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists and mixes.

    Matches playlist-style URLs on youtube.com (``playlist?list=``,
    ``watch?...&list=``, ``embed/videoseries?list=``, ``/p/<id>``, ...) as
    well as bare playlist ids carrying one of the known prefixes.
    """
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Was misspelled 'playlist_mincout', a key no other test entry uses.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _extract_mix(self, playlist_id):
        """Return a playlist result for the mix identified by *playlist_id*."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        # Try the known title containers in order; the first hit wins.
        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Return a playlist result for a regular playlist.

        Raises ExtractorError (expected) when the page reports that the
        playlist does not exist, is private, or that the parameters are
        invalid. Other alert messages are reported as warnings.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
                raise ExtractorError(
                    'The playlist doesn\'t exist or is private, use --username or '
                    '--netrc to access it.',
                    expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        # Extract the video ids from the playlist pages
        def _entries():
            more_widget_html = content_html = page
            for page_num in itertools.count(1):
                matches = re.finditer(self._VIDEO_RE, content_html)
                # We remove the duplicates and the link with index 0
                # (it's not the first video of the playlist)
                new_ids = orderedSet(
                    m.group('id') for m in matches if m.group('index') != '0')
                for vid_id in new_ids:
                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)

                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                    'Downloading page #%s' % page_num,
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                if not content_html.strip():
                    # Some webpages show a "Load more" button but they don't
                    # have more videos
                    break
                more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(_entries(), playlist_id, playlist_title)

    def _real_extract(self, url):
        """Dispatch *url* to single-video, mix or regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith(('RD', 'UL')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        return self._extract_playlist(playlist_id)
2013-06-24 01:58:33 +08:00
class YoutubeChannelIE(InfoExtractor):
    """Extractor for the videos of a YouTube channel."""
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    @staticmethod
    def extract_videos_from_page(page):
        """Return ``(video_id, title)`` pairs found in *page*.

        Each id appears once, in order of first appearance. If the first
        occurrence of an id carried no title, a title found on a later
        occurrence is used instead.
        """
        ids_in_page = []
        titles_in_page = []
        # Position of each id in the two lists above; avoids rescanning
        # ids_in_page with list.index() for every match.
        position_of = {}
        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            idx = position_of.get(video_id)
            if idx is None:
                position_of[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)

    def _real_extract(self, url):
        """Return the channel's videos as a playlist result.

        Prefers delegating to the channel's uploads playlist; falls back to
        walking the channel's paginated video listing.
        """
        channel_id = self._match_id(url)

        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        channel_playlist_id = None
        # The download above is non-fatal, so it may hand back a falsy
        # non-string instead of raising (NOTE(review): confirm the exact
        # value against _download_webpage). Searching it would crash, so
        # skip straight to the paged fallback in that case.
        if channel_page:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                channel_playlist_id = self._search_regex(
                    r'data-channel-external-id="([^"]+)"',
                    channel_page, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id, video_title in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id,
                        video_title=video_title)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
2013-06-24 01:58:33 +08:00
2015-04-22 00:36:41 +08:00
class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a YouTube user's videos (user URL or ``ytuser:`` keyword)."""
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module claims *url*."""
        # This class's regex is very permissive and would also match URLs
        # that belong to the other YouTube extractors, so give every other
        # module-level ``*IE`` class the first chance to claim the URL.
        for name, klass in globals().items():
            if not name.endswith('IE') or klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
2013-09-06 04:38:23 +08:00
2013-06-24 02:28:15 +08:00
2015-04-22 01:30:31 +08:00
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for ``ytsearch`` keyword searches."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string arguments merged into every results request;
    # subclasses override this to change e.g. the sort order.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are fetched one by one until *n* videos have been
        collected or a page yields no videos. Raises ExtractorError
        (expected) when a page contains a search message instead of results.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are collected. The previous
            # strict ``>`` fetched one extra page when exactly n were found.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
2013-07-01 23:59:28 +08:00
2014-03-04 10:32:28 +08:00
2013-11-03 10:40:48 +08:00
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the keyword search extractor that sorts by upload date."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # Added to every results request issued by the inherited search code.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2013-07-01 23:59:28 +08:00
2014-03-04 10:32:28 +08:00
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com ``/results?search_query=...`` pages."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist whose entries are the result links on the page."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def _build_entry(snippet):
            # The title is optional (non-fatal); the link itself is required.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', item_href),
                'title': item_title,
            }

        # One entry per result heading found inside the result list.
        entries = [
            _build_entry(snippet)
            for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
2013-07-01 23:59:28 +08:00
class YoutubeShowIE(InfoExtractor):
    """Extractor for youtube.com ``/show/<name>`` pages."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one ``YoutubePlaylist`` entry per season."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
2013-07-07 19:58:23 +08:00
2013-07-25 02:40:12 +08:00
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _real_extract(self, url):
        """Return the feed's videos as a playlist result.

        Follows the "load more" continuation links until none is left or a
        portion contributes no video id that has not been seen already.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        seen_ids = set()
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            #
            # This must be a real list: on Python 3 filter() returns a lazy
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            ids.extend(new_ids)
            seen_ids.update(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "watch later" list, handled as playlist ``WL``."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        """Ignore *url* and extract the fixed ``WL`` playlist."""
        watch_later_id = 'WL'
        return self._extract_playlist(watch_later_id)
2013-11-24 21:33:50 +08:00
2014-11-24 03:41:03 +08:00
2013-07-25 02:45:19 +08:00
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the favourites list (``:ytfav`` keyword)."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the favourites playlist id and delegate to ``YoutubePlaylist``."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The playlist id is taken from the first ``list=`` parameter on the page.
        return self.url_result(
            self._search_regex(
                r'list=(.+?)["&]', favourites_page, 'favourites playlist id'),
            'YoutubePlaylist')
2013-10-07 18:21:24 +08:00
2015-05-15 23:06:59 +08:00
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``recommended`` feed (``:ytrec`` keyword)."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
2014-09-01 05:44:43 +08:00
2015-05-15 23:06:59 +08:00
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``subscriptions`` feed (``:ytsubs`` keyword)."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
2014-09-01 05:44:43 +08:00
2015-05-15 23:06:59 +08:00
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``history`` feed (``:ythistory`` keyword)."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the pattern contains ``\.``, which is an invalid escape
    # sequence in a normal string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
2014-09-01 05:44:43 +08:00
2013-10-07 18:21:24 +08:00
class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch URLs with no video id and explains the likely cause."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError advising to quote the URL."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)
2015-01-02 06:44:39 +08:00
class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v=`` id is 1 to 10 characters long."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)