youtube-dl/youtube_dl/extractor/mixcloud.py

from __future__ import unicode_literals

import re
import itertools

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
)
from ..utils import (
    ExtractorError,
    HEADRequest,
    str_to_int,
)


class MixcloudIE(InfoExtractor):
    """Extractor for individual Mixcloud cloudcast pages.

    The page only exposes a preview URL; the full-length file URL is derived
    from it and then probed on numbered ``stream<N>`` hosts until one answers.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
    IE_NAME = 'mixcloud'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
        'info_dict': {
            'id': 'dholbach-cryptkeeper',
            'ext': 'mp3',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
            'uploader': 'Daniel Holbach',
            'uploader_id': 'dholbach',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
            'like_count': int,
        },
    }, {
        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
        'info_dict': {
            'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
            'ext': 'mp3',
            'title': 'Caribou 7 inch Vinyl Mix & Chat',
            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
            'uploader': 'Gilles Peterson Worldwide',
            'uploader_id': 'gillespeterson',
            'thumbnail': 're:https?://.*/images/',
            'view_count': int,
            'like_count': int,
        },
    }]

    def _get_url(self, track_id, template_url, server_number):
        """Return the first candidate URL that answers a HEAD request.

        template_url must contain exactly one ``%d`` placeholder, which is
        filled with each server number in turn (any other literal ``%`` has
        to be escaped as ``%%``). Server numbers are tried in the order
        produced by server_numbers(), starting at server_number.

        Returns the working URL, or None when every candidate failed.
        """
        boundaries = (1, 30)
        for nr in server_numbers(server_number, boundaries):
            url = template_url % nr
            try:
                # We only want to know if the request succeeds;
                # don't download the whole file
                self._request_webpage(
                    HEADRequest(url), track_id,
                    'Checking URL %d/%d ...' % (nr, boundaries[-1]))
            except ExtractorError:
                # Deliberate best effort: a failed probe just means the file
                # is not on this server, so move on to the next candidate.
                continue
            return url
        return None

    def _real_extract(self, url):
        """Build the info dict (id, title, url and metadata) for a cloudcast URL.

        Raises ExtractorError when no working track URL can be found, neither
        with the mp3 nor with the m4a URL layout.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group(1)
        cloudcast_name = mobj.group(2)
        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))

        webpage = self._download_webpage(url, track_id)

        preview_url = self._search_regex(
            r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
        song_url = preview_url.replace('/previews/', '/c/originals/')
        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))
        # The template is later expanded with `template_url % nr`, so any
        # literal '%' scraped from the page (e.g. percent-encoding) must be
        # doubled first; otherwise the formatting fails or mangles the URL.
        template_url = re.sub(
            r'(stream\d*)', 'stream%d', song_url.replace('%', '%%'))
        final_song_url = self._get_url(track_id, template_url, server_number)
        if final_song_url is None:
            self.to_screen('Trying with m4a extension')
            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
            final_song_url = self._get_url(track_id, template_url, server_number)
        if final_song_url is None:
            raise ExtractorError('Unable to extract track url')

        # Common regex prefix: skips from the 'm-play-on-spacebar' marker over
        # any intervening attributes up to the m-* attribute being extracted.
        PREFIX = (
            r'm-play-on-spacebar[^>]+'
            r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
        title = self._html_search_regex(
            PREFIX + r'm-title="([^"]+)"', webpage, 'title')
        thumbnail = self._proto_relative_url(self._html_search_regex(
            PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',
            fatal=False))
        # From here on `uploader` is the display name scraped from the page,
        # no longer the URL path component matched above.
        uploader = self._html_search_regex(
            PREFIX + r'm-owner-name="([^"]+)"',
            webpage, 'uploader', fatal=False)
        uploader_id = self._search_regex(
            r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
        description = self._og_search_description(webpage)
        like_count = str_to_int(self._search_regex(
            r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
            webpage, 'like count', fatal=False))
        view_count = str_to_int(self._search_regex(
            [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
             r'/listeners/?">([0-9,.]+)</a>'],
            webpage, 'play count', fatal=False))

        return {
            'id': track_id,
            'title': title,
            'url': final_song_url,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'view_count': view_count,
            'like_count': like_count,
        }


def server_numbers(first, boundaries):
    """ Server numbers to try in descending order of probable availability.
    Starting from first (i.e. the number of the server hosting the preview file)
    and going further and further up to the higher boundary and down to the
    lower one in an alternating fashion. Namely:

        server_numbers(2, (1, 5))

        # Where the preview server is 2, min number is 1 and max is 5.
        # Yields: 2, 3, 1, 4, 5

    Why not random numbers or increasing sequences? Since from what I've seen,
    full length files seem to be hosted on servers whose number is closer to
    that of the preview; to be confirmed.

    first is always yielded first, as given. boundaries is an inclusive
    (lowest, highest) pair; every other number in that range is yielded
    exactly once, including 0 when the range contains it.

    Raises ValueError (on first iteration, since this is a generator) when
    boundaries does not have exactly two elements.
    """
    zip_longest = getattr(itertools, 'zip_longest', None)
    if zip_longest is None:
        # python 2.x
        zip_longest = itertools.izip_longest

    if len(boundaries) != 2:
        raise ValueError("boundaries should be a two-element tuple")
    lowest, highest = boundaries
    highs = range(first + 1, highest + 1)
    lows = range(first - 1, lowest - 1, -1)

    yield first
    for pair in zip_longest(highs, lows):
        for n in pair:
            # zip_longest pads the shorter side with None. Compare against
            # None explicitly: a plain truthiness test would also discard
            # the valid server number 0.
            if n is not None:
                yield n
[mixcloud] Use unicode_literals 2014-01-17 11:06:18 +08:00			`from __future__ import unicode_literals`

Move MixCloud into its own file 2013-06-24 03:59:15 +08:00			`import re`
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00			`import itertools`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00
			`from .common import InfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`from ..compat import (`
[mixcloud] Unquote the track id (#2462) 2014-02-28 01:58:09 +08:00			`compat_urllib_parse,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`)`
			`from ..utils import (`
Remove the calls to 'compat_urllib_request.urlopen' in a few extractors 2013-12-09 05:24:55 +08:00			`ExtractorError,`
[mixcloud] Use a HEAD request when checking if the url is valid 2014-08-26 20:55:15 +08:00			`HEADRequest,`
[mixcloud] Fix extraction (Closes #4784) 2015-01-31 01:21:44 +08:00			`str_to_int,`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00			`)`


			`class MixcloudIE(InfoExtractor):`
[mixcloud] Fix _VALID_RE (fixes #2462) Accept any character except `/` for uploader and the name, caused problems with non ASCII characters 2014-02-26 07:04:03 +08:00			`_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'`
[mixcloud] Use unicode_literals 2014-01-17 11:06:18 +08:00			`IE_NAME = 'mixcloud'`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00
[mixcloud] Fix extraction (Closes #4862) 2015-02-04 21:47:55 +08:00			`_TESTS = [{`
[mixcloud] Use unicode_literals 2014-01-17 11:06:18 +08:00			`'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',`
			`'info_dict': {`
[mixcloud] Unquote the track id (#2462) 2014-02-28 01:58:09 +08:00			`'id': 'dholbach-cryptkeeper',`
			`'ext': 'mp3',`
[mixcloud] Use unicode_literals 2014-01-17 11:06:18 +08:00			`'title': 'Cryptkeeper',`
			`'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',`
			`'uploader': 'Daniel Holbach',`
			`'uploader_id': 'dholbach',`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`'thumbnail': 're:https?://.*\.jpg',`
			`'view_count': int,`
			`'like_count': int,`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00			`},`
[mixcloud] Fix extraction (Closes #4862) 2015-02-04 21:47:55 +08:00			`}, {`
			`'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',`
			`'info_dict': {`
			`'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',`
[mixcloud] Fix extraction of some metadata The second test had some wrong info. I couldn't find the timestamp, so I have removed it. 2015-03-18 23:50:23 +08:00			`'ext': 'mp3',`
			`'title': 'Caribou 7 inch Vinyl Mix & Chat',`
[mixcloud] Fix extraction (Closes #4862) 2015-02-04 21:47:55 +08:00			`'description': 'md5:2b8aec6adce69f9d41724647c65875e8',`
[mixcloud] Fix extraction of some metadata The second test had some wrong info. I couldn't find the timestamp, so I have removed it. 2015-03-18 23:50:23 +08:00			`'uploader': 'Gilles Peterson Worldwide',`
[mixcloud] Fix extraction (Closes #4862) 2015-02-04 21:47:55 +08:00			`'uploader_id': 'gillespeterson',`
[mixcloud] Fix extraction of some metadata The second test had some wrong info. I couldn't find the timestamp, so I have removed it. 2015-03-18 23:50:23 +08:00			`'thumbnail': 're:https?://.*/images/',`
[mixcloud] Fix extraction (Closes #4862) 2015-02-04 21:47:55 +08:00			`'view_count': int,`
			`'like_count': int,`
			`},`
			`}]`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00			`def _get_url(self, track_id, template_url, server_number):`
			`boundaries = (1, 30)`
			`for nr in server_numbers(server_number, boundaries):`
			`url = template_url % nr`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00			`try:`
Remove the calls to 'compat_urllib_request.urlopen' in a few extractors 2013-12-09 05:24:55 +08:00			`# We only want to know if the request succeed`
			`# don't download the whole file`
[mixcloud] Output downloading progress 2014-10-15 06:53:54 +08:00			`self._request_webpage(`
			`HEADRequest(url), track_id,`
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00			`'Checking URL %d/%d ...' % (nr, boundaries[-1]))`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00			`return url`
Remove the calls to 'compat_urllib_request.urlopen' in a few extractors 2013-12-09 05:24:55 +08:00			`except ExtractorError:`
[mixcloud] Output downloading progress 2014-10-15 06:53:54 +08:00			`pass`
Move MixCloud into its own file 2013-06-24 03:59:15 +08:00			`return None`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00			`uploader = mobj.group(1)`
			`cloudcast_name = mobj.group(2)`
[mixcloud] Unquote the track id (#2462) 2014-02-28 01:58:09 +08:00			`track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))`
[mixcloud] Fix URL extraction 2014-01-17 11:05:15 +08:00
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00			`webpage = self._download_webpage(url, track_id)`

[mixcloud] Fix URL extraction 2014-01-17 11:05:15 +08:00			`preview_url = self._search_regex(`
[mixcloud] Fix extraction (Closes #4862) 2015-02-04 21:47:55 +08:00			`r'\s(?:data-preview-url\|m-preview)="([^"]+)"', webpage, 'preview url')`
[mixcloud] Fix track url transformation (fixes #2068) ‘/previews/‘ must be replaced with ‘/c/originals/‘ now. 2014-01-02 04:07:55 +08:00			`song_url = preview_url.replace('/previews/', '/c/originals/')`
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00			`server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00			`template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)`
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00			`final_song_url = self._get_url(track_id, template_url, server_number)`
[mixcloud] Try to get the m4a url if the mp3 url fails to download (fixes #1939) 2013-12-10 20:42:41 +08:00			`if final_song_url is None:`
			`self.to_screen('Trying with m4a extension')`
			`template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')`
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00			`final_song_url = self._get_url(track_id, template_url, server_number)`
[mixcloud] Try to get the m4a url if the mp3 url fails to download (fixes #1939) 2013-12-10 20:42:41 +08:00			`if final_song_url is None:`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`raise ExtractorError('Unable to extract track url')`

			`PREFIX = (`
[mixcloud] Fix extraction of some metadata The second test had some wrong info. I couldn't find the timestamp, so I have removed it. 2015-03-18 23:50:23 +08:00			`r'm-play-on-spacebar[^>]+'`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')`
			`title = self._html_search_regex(`
			`PREFIX + r'm-title="([^"]+)"', webpage, 'title')`
			`thumbnail = self._proto_relative_url(self._html_search_regex(`
			`PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',`
			`fatal=False))`
			`uploader = self._html_search_regex(`
			`PREFIX + r'm-owner-name="([^"]+)"',`
			`webpage, 'uploader', fatal=False)`
			`uploader_id = self._search_regex(`
			`r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)`
			`description = self._og_search_description(webpage)`
[mixcloud] Fix extraction (Closes #4784) 2015-01-31 01:21:44 +08:00			`like_count = str_to_int(self._search_regex(`
[mixcloud] Fix extraction of like count (reported in #5231) 2015-04-04 01:37:35 +08:00			`r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`webpage, 'like count', fatal=False))`
[mixcloud] Fix extraction (Closes #4784) 2015-01-31 01:21:44 +08:00			`view_count = str_to_int(self._search_regex(`
			`[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',`
			`r'/listeners/?">([0-9,.]+)</a>'],`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`webpage, 'play count', fatal=False))`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00
			`return {`
			`'id': track_id,`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`'title': title,`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00			`'url': final_song_url,`
[mixcloud] Shed API dependency (#2904) 2014-05-13 15:42:38 +08:00			`'description': description,`
			`'thumbnail': thumbnail,`
			`'uploader': uploader,`
			`'uploader_id': uploader_id,`
			`'view_count': view_count,`
			`'like_count': like_count,`
[mixcloud] Rewrite extractor (fixes #278) 2013-09-14 20:26:42 +08:00			`}`
[mixcloud] Try preview server first, then further numbers 2015-03-16 07:20:06 +08:00

			`def server_numbers(first, boundaries):`
			`""" Server numbers to try in descending order of probable availability.`
			`Starting from first (i.e. the number of the server hosting the preview file)`
			`and going further and further up to the higher boundary and down to the`
			`lower one in an alternating fashion. Namely:`

			`server_numbers(2, (1, 5))`

			`# Where the preview server is 2, min number is 1 and max is 5.`
			`# Yields: 2, 3, 1, 4, 5`

			`Why not random numbers or increasing sequences? Since from what I've seen,`
			`full length files seem to be hosted on servers whose number is closer to`
			`that of the preview; to be confirmed.`
			`"""`
			`zip_longest = getattr(itertools, 'zip_longest', None)`
			`if zip_longest is None:`
			`# python 2.x`
			`zip_longest = itertools.izip_longest`

			`if len(boundaries) != 2:`
			`raise ValueError("boundaries should be a two-element tuple")`
			`min, max = boundaries`
			`highs = range(first + 1, max + 1)`
			`lows = range(first - 1, min - 1, -1)`
			`rest = filter(`
			`None, itertools.chain.from_iterable(zip_longest(highs, lows)))`
			`yield first`
			`for n in rest:`
			`yield n`