youtube-dl/youtube_dl/extractor/vevo.py

from __future__ import unicode_literals

import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..compat import (
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
)


class VevoIE(InfoExtractor):
    """
    Accepts urls from vevo.com or in the format 'vevo:{id}'
    (currently used by MTVIE and MySpaceIE)

    Formats are gathered from the rendition list embedded in the
    AuthenticateVideo JSON response and, when an oauth token could be
    retrieved during initialization, from the HLS streams API as well.
    """
    _VALID_URL = r'''(?x)
        (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|
           https?://cache\.vevo\.com/m/html/embed\.html\?video=|
           https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
           vevo:)
        (?P<id>[^&?#]+)'''

    _TESTS = [{
        'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
        "md5": "95ee28ee45e70130e3ab02b0f579ae23",
        'info_dict': {
            'id': 'GB1101300280',
            'ext': 'mp4',
            "upload_date": "20130624",
            "uploader": "Hurts",
            "title": "Somebody to Die For",
            "duration": 230.12,
            "width": 1920,
            "height": 1080,
            # timestamp and upload_date are often incorrect; seem to change randomly
            'timestamp': int,
        }
    }, {
        'note': 'v3 SMIL format',
        'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
        'md5': 'f6ab09b034f8c22969020b042e5ac7fc',
        'info_dict': {
            'id': 'USUV71302923',
            'ext': 'mp4',
            'upload_date': '20140219',
            'uploader': 'Cassadee Pope',
            'title': 'I Wish I Could Break Your Heart',
            'duration': 226.101,
            'age_limit': 0,
            'timestamp': int,
        }
    }, {
        'note': 'Age-limited video',
        'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
        'info_dict': {
            'id': 'USRV81300282',
            'ext': 'mp4',
            'age_limit': 18,
            'title': 'Tunnel Vision (Explicit)',
            'uploader': 'Justin Timberlake',
            'upload_date': 're:2013070[34]',
            'timestamp': int,
        },
        'params': {
            'skip_download': 'true',
        }
    }]
    # Prefix prepended to the path component of SMIL <video> sources
    _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'

    def _real_initialize(self):
        """Try to fetch an oauth token and store it in self._oauth_token.

        The request is non-fatal: when the download fails, or no token can
        be found in the response, self._oauth_token ends up falsy and
        _download_api_formats() skips the HLS API.
        """
        req = compat_urllib_request.Request(
            'http://www.vevo.com/auth', data=b'')
        webpage = self._download_webpage(
            req, None,
            note='Retrieving oauth token',
            errnote='Unable to retrieve oauth token',
            fatal=False)
        if webpage is False:
            self._oauth_token = None
        else:
            self._oauth_token = self._search_regex(
                r'access_token":\s*"([^"]+)"',
                webpage, 'access token', fatal=False)

    def _formats_from_json(self, video_info):
        """Build the format list from the 'videoVersions' of video_info.

        Among the entries with sourceType 2, the one with the highest
        'version' is selected and its 'data' XML is parsed; every
        <rendition> element yields one format dict.

        Raises ExtractorError when no suitable version is present.
        """
        last_version = {'version': -1}
        # A missing or null 'videoVersions' is treated like an empty list so
        # that the ExtractorError below is raised instead of a KeyError
        for version in video_info.get('videoVersions') or []:
            # These are the HTTP downloads, other types are for different manifests
            if (version['sourceType'] == 2 and
                    version['version'] > last_version['version']):
                last_version = version
        if last_version['version'] == -1:
            raise ExtractorError('Unable to extract last version of the video')

        renditions = xml.etree.ElementTree.fromstring(last_version['data'])
        formats = []
        # Already sorted from worst to best quality
        for rend in renditions.findall('rendition'):
            attr = rend.attrib
            format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr
            formats.append({
                'url': attr['url'],
                'format_id': attr['name'],
                'format_note': format_note,
                # The differing capitalisation of the two attribute names is
                # kept exactly as in the original lookup
                'height': int(attr['frameheight']),
                'width': int(attr['frameWidth']),
            })
        return formats

    def _formats_from_smil(self, smil_xml):
        """Build a format list from a SMIL document given as a text string.

        Only <video> elements whose 'src' matches the expected
        '<ext>:<path>_<cbr>k_<w>x<h>_<vcodec>_<vbr>_<acodec>_<abr>.<ext>'
        naming scheme are used; all others are skipped silently.
        """
        # Compiled once here instead of being rebuilt on every iteration
        src_re = re.compile(r'''(?xi)
                (?P<ext>[a-z0-9]+):
                (?P<path>
                    [/a-z0-9]+     # The directory and main part of the URL
                    _(?P<cbr>[0-9]+)k
                    _(?P<width>[0-9]+)x(?P<height>[0-9]+)
                    _(?P<vcodec>[a-z0-9]+)
                    _(?P<vbr>[0-9]+)
                    _(?P<acodec>[a-z0-9]+)
                    _(?P<abr>[0-9]+)
                    \.[a-z0-9]+  # File extension
                )''')

        formats = []
        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
        els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
        for el in els:
            m = src_re.match(el.attrib['src'])
            if not m:
                continue

            formats.append({
                'url': self._SMIL_BASE_URL + m.group('path'),
                'format_id': 'SMIL_' + m.group('cbr'),
                'vcodec': m.group('vcodec'),
                'acodec': m.group('acodec'),
                'vbr': int(m.group('vbr')),
                'abr': int(m.group('abr')),
                'ext': m.group('ext'),
                'width': int(m.group('width')),
                'height': int(m.group('height')),
            })
        return formats

    def _download_api_formats(self, video_id):
        """Return the HLS formats advertised by the streams API.

        This step is best-effort: an empty list is returned (after a
        warning) when no oauth token is available, when the format list
        cannot be downloaded, or when the response does not have the
        expected shape.
        """
        if not self._oauth_token:
            self._downloader.report_warning(
                'No oauth token available, skipping API HLS download')
            return []

        api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % (
            video_id, self._oauth_token)
        api_data = self._download_json(
            api_url, video_id,
            note='Downloading HLS formats',
            errnote='Failed to download HLS format list', fatal=False)
        if not api_data:
            return []

        try:
            m3u8_url = api_data[0]['url']
        except (IndexError, KeyError, TypeError):
            # The download itself is non-fatal, so an unexpected payload
            # must not abort the extraction of the remaining formats
            self._downloader.report_warning(
                'Unexpected HLS format list, skipping API HLS download')
            return []

        return self._extract_m3u8_formats(
            m3u8_url, video_id, entry_protocol='m3u8_native', ext='mp4',
            preference=0)

    def _real_extract(self, url):
        """Extract the info dict for the video referenced by url.

        Raises ExtractorError when the service returns no video data; if
        the response carries a 'statusMessage', it is reported as an
        expected error.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
        response = self._download_json(json_url, video_id)
        # .get() so that a response without a 'video' key still reaches the
        # friendly error reporting below
        video_info = response.get('video')

        if not video_info:
            if 'statusMessage' in response:
                raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True)
            raise ExtractorError('Unable to extract videos')

        formats = self._formats_from_json(video_info)

        # Identity checks on purpose: only a real boolean decides the limit
        is_explicit = video_info.get('isExplicit')
        if is_explicit is True:
            age_limit = 18
        elif is_explicit is False:
            age_limit = 0
        else:
            age_limit = None

        # Download via HLS API
        formats.extend(self._download_api_formats(video_id))

        self._sort_formats(formats)

        # Everything below except the title is optional metadata; a missing
        # value must not break an otherwise successful extraction
        launch_date = self._search_regex(
            r'/Date\((\d+)\)/', video_info.get('launchDate') or '',
            'launch date', fatal=False)
        # launchDate is given in milliseconds
        timestamp = int(launch_date) // 1000 if launch_date else None

        main_artists = video_info.get('mainArtists')
        uploader = main_artists[0].get('artistName') if main_artists else None

        return {
            'id': video_id,
            'title': video_info['title'],
            'formats': formats,
            'thumbnail': video_info.get('imageUrl'),
            'timestamp': timestamp,
            'uploader': uploader,
            'duration': video_info.get('duration'),
            'age_limit': age_limit,
        }
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`from __future__ import unicode_literals`

Add VevoIE 2013-06-24 18:31:41 +08:00			`import re`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`import xml.etree.ElementTree`
Add VevoIE 2013-06-24 18:31:41 +08:00
			`from .common import InfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`from ..compat import (`
[vevo] Support 1080p videos (Fixes #3656) 2014-09-24 20:16:56 +08:00			`compat_urllib_request,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`)`
			`from ..utils import (`
Add VevoIE 2013-06-24 18:31:41 +08:00			`ExtractorError,`
			`)`

[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00
Add VevoIE 2013-06-24 18:31:41 +08:00			`class VevoIE(InfoExtractor):`
MTVIE: add support for Vevo videos (related #913) 2013-06-24 19:54:19 +08:00			`"""`
[vevo] fix testcase 2013-08-11 13:12:38 +08:00			`Accepts urls from vevo.com or in the format 'vevo:{id}'`
[myspace] Redirect to other extractors There are many songs just linked from Vevo/YouTube to MySpace. Vevo example: https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041 YouTube example: https://myspace.com/starset2/music/song/first-light-95799905-106964426 2014-12-01 03:00:16 +08:00			`(currently used by MTVIE and MySpaceIE)`
MTVIE: add support for Vevo videos (related #913) 2013-06-24 19:54:19 +08:00			`"""`
Add support for embedded vevo player (Fixes #1957) 2013-12-17 04:45:21 +08:00			`_VALID_URL = r'''(?x)`
[vevo] The title in the url is optional (fixes #3020) 2014-05-31 23:55:03 +08:00			`(?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?\|`
Add support for embedded vevo player (Fixes #1957) 2013-12-17 04:45:21 +08:00			`https?://cache\.vevo\.com/m/html/embed\.html\?video=\|`
[vevo] Add suppor for videoplayer. URLs (#1957) 2013-12-17 04:48:38 +08:00			`https?://videoplayer\.vevo\.com/embed/embedded\?videoId=\|`
Add support for embedded vevo player (Fixes #1957) 2013-12-17 04:45:21 +08:00			`vevo:)`
			`(?P<id>[^&?#]+)'''`
[vevo] Mark all test timestamps as approximate 2014-03-16 14:05:48 +08:00
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`_TESTS = [{`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',`
[hlsnative] Support test parameter 2014-09-24 20:38:40 +08:00			`"md5": "95ee28ee45e70130e3ab02b0f579ae23",`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'info_dict': {`
[vevo] Add suppot for v3 SMIL URLs (Fixes #2409) 2014-02-25 18:06:47 +08:00			`'id': 'GB1101300280',`
			`'ext': 'mp4',`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`"upload_date": "20130624",`
			`"uploader": "Hurts",`
			`"title": "Somebody to Die For",`
			`"duration": 230.12,`
			`"width": 1920,`
			`"height": 1080,`
[vevo] Mark all test timestamps as approximate 2014-03-16 14:05:48 +08:00			`# timestamp and upload_date are often incorrect; seem to change randomly`
			`'timestamp': int,`
Move tests to the IE definitions 2013-06-28 02:46:46 +08:00			`}`
[vevo] Add suppot for v3 SMIL URLs (Fixes #2409) 2014-02-25 18:06:47 +08:00			`}, {`
			`'note': 'v3 SMIL format',`
			`'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',`
[hlsnative] Support test parameter 2014-09-24 20:38:40 +08:00			`'md5': 'f6ab09b034f8c22969020b042e5ac7fc',`
[vevo] Add suppot for v3 SMIL URLs (Fixes #2409) 2014-02-25 18:06:47 +08:00			`'info_dict': {`
			`'id': 'USUV71302923',`
			`'ext': 'mp4',`
			`'upload_date': '20140219',`
			`'uploader': 'Cassadee Pope',`
			`'title': 'I Wish I Could Break Your Heart',`
			`'duration': 226.101,`
[vevo] Add age_limit support 2014-02-25 18:15:34 +08:00			`'age_limit': 0,`
[vevo] Mark all test timestamps as approximate 2014-03-16 14:05:48 +08:00			`'timestamp': int,`
[vevo] Add age_limit support 2014-02-25 18:15:34 +08:00			`}`
			`}, {`
			`'note': 'Age-limited video',`
			`'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',`
			`'info_dict': {`
			`'id': 'USRV81300282',`
			`'ext': 'mp4',`
			`'age_limit': 18,`
			`'title': 'Tunnel Vision (Explicit)',`
			`'uploader': 'Justin Timberlake',`
[vevo] Adapt test to constantly changing timestamp 2014-03-14 01:44:55 +08:00			`'upload_date': 're:2013070[34]',`
			`'timestamp': int,`
[vevo] Add age_limit support 2014-02-25 18:15:34 +08:00			`},`
			`'params': {`
			`'skip_download': 'true',`
[vevo] Add suppot for v3 SMIL URLs (Fixes #2409) 2014-02-25 18:06:47 +08:00			`}`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`}]`
			`_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'`
Add VevoIE 2013-06-24 18:31:41 +08:00
[vevo] Support 1080p videos (Fixes #3656) 2014-09-24 20:16:56 +08:00			`def _real_initialize(self):`
			`req = compat_urllib_request.Request(`
			`'http://www.vevo.com/auth', data=b'')`
			`webpage = self._download_webpage(`
			`req, None,`
			`note='Retrieving oauth token',`
			`errnote='Unable to retrieve oauth token',`
			`fatal=False)`
			`if webpage is False:`
			`self._oauth_token = None`
			`else:`
			`self._oauth_token = self._search_regex(`
			`r'access_token":\s*"([^"]+)"',`
			`webpage, 'access token', fatal=False)`

[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`def _formats_from_json(self, video_info):`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`last_version = {'version': -1}`
			`for version in video_info['videoVersions']:`
			`# These are the HTTP downloads, other types are for different manifests`
			`if version['sourceType'] == 2:`
			`if version['version'] > last_version['version']:`
			`last_version = version`
			`if last_version['version'] == -1:`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`raise ExtractorError('Unable to extract last version of the video')`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00
			`renditions = xml.etree.ElementTree.fromstring(last_version['data'])`
			`formats = []`
			`# Already sorted from worst to best quality`
			`for rend in renditions.findall('rendition'):`
			`attr = rend.attrib`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`formats.append({`
[vevo] Add more format details 2013-10-29 22:10:09 +08:00			`'url': attr['url'],`
			`'format_id': attr['name'],`
			`'format_note': format_note,`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`'height': int(attr['frameheight']),`
			`'width': int(attr['frameWidth']),`
			`})`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`return formats`

			`def _formats_from_smil(self, smil_xml):`
			`formats = []`
			`smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))`
			`els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')`
			`for el in els:`
			`src = el.attrib['src']`
			`m = re.match(r'''(?xi)`
			`(?P<ext>[a-z0-9]+):`
			`(?P<path>`
			`[/a-z0-9]+ # The directory and main part of the URL`
			`_(?P<cbr>[0-9]+)k`
			`_(?P<width>[0-9]+)x(?P<height>[0-9]+)`
			`_(?P<vcodec>[a-z0-9]+)`
			`_(?P<vbr>[0-9]+)`
			`_(?P<acodec>[a-z0-9]+)`
			`_(?P<abr>[0-9]+)`
			`\.[a-z0-9]+ # File extension`
			`)''', src)`
			`if not m:`
			`continue`

			`format_url = self._SMIL_BASE_URL + m.group('path')`
			`formats.append({`
			`'url': format_url,`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'format_id': 'SMIL_' + m.group('cbr'),`
Add automatic generation of format note based on bitrate and codecs 2013-11-16 08:08:43 +08:00			`'vcodec': m.group('vcodec'),`
			`'acodec': m.group('acodec'),`
			`'vbr': int(m.group('vbr')),`
			`'abr': int(m.group('abr')),`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`'ext': m.group('ext'),`
			`'width': int(m.group('width')),`
			`'height': int(m.group('height')),`
			`})`
			`return formats`

[vevo] Support 1080p videos (Fixes #3656) 2014-09-24 20:16:56 +08:00			`def _download_api_formats(self, video_id):`
			`if not self._oauth_token:`
			`self._downloader.report_warning(`
			`'No oauth token available, skipping API HLS download')`
			`return []`

			`api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % (`
			`video_id, self._oauth_token)`
			`api_data = self._download_json(`
			`api_url, video_id,`
			`note='Downloading HLS formats',`
			`errnote='Failed to download HLS format list', fatal=False)`
			`if api_data is None:`
			`return []`

			`m3u8_url = api_data[0]['url']`
			`return self._extract_m3u8_formats(`
			`m3u8_url, video_id, entry_protocol='m3u8_native', ext='mp4',`
			`preference=0)`

[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id')`

			`json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id`
[vevo] Add friendly error output (#2874) 2014-05-10 05:34:53 +08:00			`response = self._download_json(json_url, video_id)`
			`video_info = response['video']`

			`if not video_info:`
			`if 'statusMessage' in response:`
			`raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True)`
			`raise ExtractorError('Unable to extract videos')`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00
			`formats = self._formats_from_json(video_info)`
[vevo] Add suppot for v3 SMIL URLs (Fixes #2409) 2014-02-25 18:06:47 +08:00
[vevo] Add age_limit support 2014-02-25 18:15:34 +08:00			`is_explicit = video_info.get('isExplicit')`
			`if is_explicit is True:`
			`age_limit = 18`
			`elif is_explicit is False:`
			`age_limit = 0`
			`else:`
			`age_limit = None`

[vevo] Support 1080p videos (Fixes #3656) 2014-09-24 20:16:56 +08:00			`# Download via HLS API`
			`formats.extend(self._download_api_formats(video_id))`

[vevo] Sort formats (Fixes #3399) 2014-07-30 15:49:55 +08:00			`self._sort_formats(formats)`
[vevo] Fix timestamp handling ( / 1000 is implicit float division ) 2013-10-29 21:00:01 +08:00			`timestamp_ms = int(self._search_regex(`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))`
[vevo] Centralize timestamp handling 2014-03-13 22:30:25 +08:00
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`return {`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`'id': video_id,`
			`'title': video_info['title'],`
			`'formats': formats,`
			`'thumbnail': video_info['imageUrl'],`
Rename upload_timestamp to timestamp 2014-03-14 01:21:55 +08:00			`'timestamp': timestamp_ms // 1000,`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`'uploader': video_info['mainArtists'][0]['artistName'],`
			`'duration': video_info['duration'],`
[vevo] Add age_limit support 2014-02-25 18:15:34 +08:00			`'age_limit': age_limit,`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`}`