2013-12-10 00:08:58 +08:00
# encoding: utf-8
2014-01-17 10:29:41 +08:00
from __future__ import unicode_literals
2013-06-24 02:57:44 +08:00
import json
import re
2013-09-15 03:41:49 +08:00
import itertools
2013-06-24 02:57:44 +08:00
from . common import InfoExtractor
from . . utils import (
compat_str ,
2013-08-21 23:06:37 +08:00
compat_urlparse ,
2013-09-15 03:41:49 +08:00
compat_urllib_parse ,
2013-06-24 02:57:44 +08:00
ExtractorError ,
unified_strdate ,
)
class SoundcloudIE(InfoExtractor):
    """Information extractor for single soundcloud.com tracks.

    Three URL shapes are accepted (see ``_VALID_URL``):

    * track pages: ``soundcloud.com/<uploader>/<title>[/<token>]``
      (``sets/`` paths are excluded by a negative lookahead);
    * API track URLs: ``api.soundcloud.com/tracks/<track_id>``;
    * player widget URLs carrying the real track URL in a ``url=`` query
      parameter, which are redirected to that URL.

    Track metadata is fetched as JSON from api.soundcloud.com and the
    available stream/download URLs are turned into a ``formats`` list.
    """

    _VALID_URL = r'''(?x)^(?:https?://)?
                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                            (?P<uploader>[\w\d-]+)/
                            (?!sets/)(?P<title>[\w\d-]+)/?
                            (?P<token>[^?]+?)?(?:[?].*)?$)
                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
                       |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
                    )
                    '''
    IE_NAME = 'soundcloud'
    _TESTS = [
        {
            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
            'file': '62986583.mp3',
            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
            'info_dict': {
                "upload_date": "20121011",
                "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
                "uploader": "E.T. ExTerrestrial Music",
                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
            }
        },
        # not streamable song
        {
            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
            'info_dict': {
                'id': '47127627',
                'ext': 'mp3',
                'title': 'Goldrushed',
                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                'uploader': 'The Royal Concept',
                'upload_date': '20120521',
            },
            'params': {
                # rtmp
                'skip_download': True,
            },
        },
        # private link
        {
            'url': 'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
            'info_dict': {
                'id': '123998367',
                'ext': 'mp3',
                'title': 'Youtube - Dl Test Video \'\' Ä↭',
                'uploader': 'jaimeMF',
                'description': 'test chars: \"\'/\\ä↭',
                'upload_date': '20131209',
            },
        },
        # downloadable song
        {
            'url': 'https://soundcloud.com/simgretina/just-your-problem-baby-1',
            'md5': '56a8b69568acaa967b4c49f9d1d52d19',
            'info_dict': {
                'id': '105614606',
                'ext': 'wav',
                'title': 'Just Your Problem Baby (Acapella)',
                'description': 'Vocals',
                'uploader': 'Sim Gretina',
                'upload_date': '20130815',
            },
        },
    ]

    # Client id sent with resolve, track-info, download and fallback requests.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
    # Client id sent with the "i1" streams request.
    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'

    def report_resolve(self, video_id):
        """Print a message saying that ``video_id`` is being resolved."""
        self.to_screen('%s: Resolving id' % video_id)

    @classmethod
    def _resolv_url(cls, url):
        """Return the resolve.json API URL that looks up ``url``."""
        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID

    def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
        """Build the result dict for one track from its API JSON.

        ``info`` is the decoded track JSON; ``id``, ``user.username``,
        ``created_at`` and ``title`` are required keys.  ``full_title`` is
        only used as the display name for progress output (the track id is
        used when it is missing).  ``secret_token`` is the private-link token
        taken from the page URL, if any; it is forwarded to the streams
        request.

        Returns a dict with ``id``, ``uploader``, ``upload_date``, ``title``,
        ``description``, ``thumbnail`` and a sorted ``formats`` list.
        """
        track_id = compat_str(info['id'])
        name = full_title or track_id
        # NOTE(review): extraction is reported only when ``quiet`` is true,
        # which looks inverted with respect to the parameter name -- confirm
        # against the callers before changing it.
        if quiet:
            self.report_extraction(name)

        # Artwork is optional metadata; ask for the 500x500 rendition.
        thumbnail = info.get('artwork_url')
        if thumbnail is not None:
            thumbnail = thumbnail.replace('-large', '-t500x500')

        # Extension used for every stream format.
        ext = 'mp3'
        result = {
            'id': track_id,
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'description': info.get('description'),
            'thumbnail': thumbnail,
        }

        formats = []
        if info.get('downloadable', False):
            # We can build a direct link to the song
            format_url = (
                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
                    track_id, self._CLIENT_ID))
            formats.append({
                'format_id': 'download',
                'ext': info.get('original_format', 'mp3'),
                'url': format_url,
                'vcodec': 'none',
                'preference': 10,
            })

        # We have to retrieve the url
        streams_url = 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(
            track_id, self._IPHONE_CLIENT_ID)
        if secret_token:
            # Only private links carry a token; never send the text "None".
            streams_url += '&secret_token=' + secret_token
        format_dict = self._download_json(
            streams_url,
            track_id, 'Downloading track url')

        for key, stream_url in format_dict.items():
            if key.startswith('http'):
                formats.append({
                    'format_id': key,
                    'ext': ext,
                    'url': stream_url,
                    'vcodec': 'none',
                })
            elif key.startswith('rtmp'):
                # The url doesn't have an rtmp app, we have to extract the playpath
                url, path = stream_url.split('mp3:', 1)
                formats.append({
                    'format_id': key,
                    'url': url,
                    'play_path': 'mp3:' + path,
                    'ext': ext,
                    'vcodec': 'none',
                })

        if not formats:
            # We fallback to the stream_url in the original info, this
            # cannot be always used, sometimes it can give an HTTP 404 error
            formats.append({
                'format_id': 'fallback',
                'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
                'ext': ext,
                'vcodec': 'none',
            })

        # Tag each format with its protocol, derived from the format id.
        for f in formats:
            if f['format_id'].startswith('http'):
                f['protocol'] = 'http'
            elif f['format_id'].startswith('rtmp'):
                f['protocol'] = 'rtmp'

        self._sort_formats(formats)
        result['formats'] = formats

        return result

    def _real_extract(self, url):
        """Extract a single track (or redirect a player URL) from ``url``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        track_id = mobj.group('track_id')
        token = None
        if track_id is not None:
            # API URL: the numeric id is already known.
            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
            full_title = track_id
        elif mobj.group('player'):
            # Player widget: hand the embedded track URL back to the downloader.
            query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
            return self.url_result(query['url'][0])
        else:
            # extract uploader (which is in the url)
            uploader = mobj.group('uploader')
            # extract simple title (uploader + slug of song title)
            slug_title = mobj.group('title')
            token = mobj.group('token')
            full_title = resolve_title = '%s/%s' % (uploader, slug_title)
            if token:
                # Private links are resolved with their token appended.
                resolve_title += '/%s' % token

            self.report_resolve(full_title)

            url = 'http://soundcloud.com/%s' % resolve_title
            info_json_url = self._resolv_url(url)

        info = self._download_json(info_json_url, full_title, 'Downloading info JSON')
        return self._extract_info_dict(info, full_title, secret_token=token)
2013-06-24 02:57:44 +08:00
2014-05-05 09:12:41 +08:00
2013-07-24 20:05:14 +08:00
class SoundcloudSetIE(SoundcloudIE):
    """Information extractor for soundcloud.com sets (``<uploader>/sets/<slug>``).

    The set URL is resolved through the API and every track of the
    response is converted with ``_extract_info_dict``.
    """

    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = 'soundcloud:set'
    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist result for the set at ``url``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when the resolve response contains an ``errors`` list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = self._resolv_url(url)
        info = self._download_json(resolv_url, full_title)

        if 'errors' in info:
            # Fail loudly with every API message instead of returning None.
            messages = [compat_str(err['error_message']) for err in info['errors']]
            raise ExtractorError(
                'unable to download video webpage: %s' % ', '.join(messages))

        self.report_extraction(full_title)
        return {
            '_type': 'playlist',
            'entries': [self._extract_info_dict(track) for track in info['tracks']],
            # Stringified for consistency with the other playlist extractors.
            'id': compat_str(info['id']),
            'title': info['title'],
        }
2013-09-15 03:41:49 +08:00
class SoundcloudUserIE(SoundcloudIE):
    """Information extractor for all tracks of a soundcloud.com user page."""

    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
    IE_NAME = 'soundcloud:user'
    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist result holding every track of the user at ``url``.

        The track list is fetched page by page using an ``offset`` parameter;
        paging stops at the first page that holds fewer than 50 entries.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        uploader = mobj.group('user')

        url = 'http://soundcloud.com/%s/' % uploader
        resolv_url = self._resolv_url(url)
        user = self._download_json(
            resolv_url, uploader, 'Downloading user info')
        base_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % uploader

        # Offset step and "last page" threshold used by the loop below.
        page_size = 50
        entries = []
        for page_num in itertools.count():
            query = compat_urllib_parse.urlencode({
                'offset': page_num * page_size,
                'client_id': self._CLIENT_ID,
            })
            new_entries = self._download_json(
                base_url + query, uploader,
                'Downloading track page %s' % (page_num + 1))
            entries.extend(
                self._extract_info_dict(e, quiet=True) for e in new_entries)
            if len(new_entries) < page_size:
                # A short page means there is nothing left to fetch.
                break

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': user['username'],
            'entries': entries,
        }
class SoundcloudPlaylistIE(SoundcloudIE):
    """Information extractor for ``api.soundcloud.com/playlists/<id>`` URLs."""

    _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)'
    IE_NAME = 'soundcloud:playlist'
    # it's in tests/test_playlists.py
    _TESTS = []

    def _real_extract(self, url):
        """Return a playlist result for the API playlist at ``url``.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        base_url = '%s//api.soundcloud.com/playlists/%s.json?' % (
            self.http_scheme(), playlist_id)
        query = compat_urllib_parse.urlencode({
            'client_id': self._CLIENT_ID,
        })
        playlist = self._download_json(
            base_url + query, playlist_id, 'Downloading playlist')

        entries = [
            self._extract_info_dict(track, quiet=True)
            for track in playlist['tracks']
        ]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist.get('title'),
            'description': playlist.get('description'),
            'entries': entries,
        }