youtube-dl/youtube_dl/extractor/vevo.py

from __future__ import unicode_literals

import re
import xml.etree.ElementTree
import datetime

from .common import InfoExtractor
from ..utils import (
    compat_HTTPError,
    ExtractorError,
)


class VevoIE(InfoExtractor):
    """
    Accepts urls from vevo.com or in the format 'vevo:{id}'
    (currently used by MTVIE)
    """
    _VALID_URL = r'''(?x)
        (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?|
           https?://cache\.vevo\.com/m/html/embed\.html\?video=|
           https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
           vevo:)
        (?P<id>[^&?#]+)'''
    _TESTS = [{
        'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
        'file': 'GB1101300280.mp4',
        "md5": "06bea460acb744eab74a9d7dcb4bfd61",
        'info_dict': {
            "upload_date": "20130624",
            "uploader": "Hurts",
            "title": "Somebody to Die For",
            "duration": 230.12,
            "width": 1920,
            "height": 1080,
        }
    }]
    _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'

    def _formats_from_json(self, video_info):
        """Build the format list from the 'videoVersions' of the JSON info.

        Only entries with sourceType == 2 are considered; among those the
        one with the highest 'version' is used, and every <rendition>
        element of its XML 'data' payload becomes one format dict.

        Raises ExtractorError if there is no sourceType == 2 entry.
        """
        # These are the HTTP downloads, other types are for different manifests
        http_versions = [
            version for version in video_info['videoVersions']
            if version['sourceType'] == 2]
        if not http_versions:
            raise ExtractorError('Unable to extract last version of the video')
        # max() returns the first maximal element, i.e. the same entry a
        # "strictly greater than" scan would have kept.
        last_version = max(
            http_versions, key=lambda version: version['version'])

        renditions = xml.etree.ElementTree.fromstring(last_version['data'])
        formats = []
        # Already sorted from worst to best quality
        for rend in renditions.findall('rendition'):
            attr = rend.attrib
            format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr
            formats.append({
                'url': attr['url'],
                'format_id': attr['name'],
                'format_note': format_note,
                # The attribute names really differ in case
                # ('frameheight' vs. 'frameWidth'); they are read verbatim.
                'height': int(attr['frameheight']),
                'width': int(attr['frameWidth']),
            })
        return formats

    def _formats_from_smil(self, smil_xml):
        """Build the format list from the text of a SMIL manifest.

        Every SMIL <video> element whose 'src' attribute matches the
        expected "<ext>:<path>_<cbr>k_<w>x<h>_<vcodec>_<vbr>_<acodec>_<abr>.<ext>"
        layout yields one format dict; elements that do not match are
        skipped silently.
        """
        formats = []
        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
        els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
        for el in els:
            src = el.attrib['src']
            m = re.match(r'''(?xi)
                (?P<ext>[a-z0-9]+):
                (?P<path>
                    [/a-z0-9]+     # The directory and main part of the URL
                    _(?P<cbr>[0-9]+)k
                    _(?P<width>[0-9]+)x(?P<height>[0-9]+)
                    _(?P<vcodec>[a-z0-9]+)
                    _(?P<vbr>[0-9]+)
                    _(?P<acodec>[a-z0-9]+)
                    _(?P<abr>[0-9]+)
                    \.[a-z0-9]+  # File extension
                )''', src)
            if not m:
                continue

            format_url = self._SMIL_BASE_URL + m.group('path')
            formats.append({
                'url': format_url,
                'format_id': 'SMIL_' + m.group('cbr'),
                'vcodec': m.group('vcodec'),
                'acodec': m.group('acodec'),
                'vbr': int(m.group('vbr')),
                'abr': int(m.group('abr')),
                'ext': m.group('ext'),
                'width': int(m.group('width')),
                'height': int(m.group('height')),
            })
        return formats

    def _real_extract(self, url):
        """Extract the info dict for the video referenced by url.

        The JSON formats are always extracted; the SMIL formats are
        appended after them when the manifest can be downloaded.  An HTTP
        error while fetching the manifest only produces a warning, any
        other ExtractorError is re-raised.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
        video_info = self._download_json(json_url, video_id)['video']

        formats = self._formats_from_json(video_info)
        try:
            # NOTE(review): _SMIL_BASE_URL already ends with '/', so this
            # URL contains '//' after the host.  Left unchanged because how
            # the server treats the single-slash form is not verified here.
            smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
                self._SMIL_BASE_URL, video_id, video_id.lower())
            smil_xml = self._download_webpage(smil_url, video_id,
                                              'Downloading SMIL info')
            formats.extend(self._formats_from_smil(smil_xml))
        except ExtractorError as ee:
            if not isinstance(ee.cause, compat_HTTPError):
                raise
            self._downloader.report_warning(
                'Cannot download SMIL information, falling back to JSON ..')

        timestamp_ms = int(self._search_regex(
            r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
        # Convert in UTC: fromtimestamp() would use the local timezone of
        # the machine, so the same video could get different upload dates
        # depending on where the extractor runs.
        upload_date = datetime.datetime.utcfromtimestamp(timestamp_ms // 1000)

        # Do not abort the whole extraction when the artist list is missing
        # or empty; the uploader is then reported as None.
        main_artists = video_info.get('mainArtists')
        uploader = main_artists[0]['artistName'] if main_artists else None

        return {
            'id': video_id,
            'title': video_info['title'],
            'formats': formats,
            'thumbnail': video_info['imageUrl'],
            'upload_date': upload_date.strftime('%Y%m%d'),
            'uploader': uploader,
            'duration': video_info['duration'],
        }
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`from __future__ import unicode_literals`

Add VevoIE 2013-06-24 18:31:41 +08:00			`import re`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`import xml.etree.ElementTree`
			`import datetime`
Add VevoIE 2013-06-24 18:31:41 +08:00
			`from .common import InfoExtractor`
			`from ..utils import (`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`compat_HTTPError,`
Add VevoIE 2013-06-24 18:31:41 +08:00			`ExtractorError,`
			`)`

[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00
Add VevoIE 2013-06-24 18:31:41 +08:00			`class VevoIE(InfoExtractor):`
MTVIE: add support for Vevo videos (related #913) 2013-06-24 19:54:19 +08:00			`"""`
[vevo] fix testcase 2013-08-11 13:12:38 +08:00			`Accepts urls from vevo.com or in the format 'vevo:{id}'`
MTVIE: add support for Vevo videos (related #913) 2013-06-24 19:54:19 +08:00			`(currently used by MTVIE)`
			`"""`
Add support for embedded vevo player (Fixes #1957) 2013-12-17 04:45:21 +08:00			`_VALID_URL = r'''(?x)`
			`(?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?\|`
			`https?://cache\.vevo\.com/m/html/embed\.html\?video=\|`
[vevo] Add suppor for videoplayer. URLs (#1957) 2013-12-17 04:48:38 +08:00			`https?://videoplayer\.vevo\.com/embed/embedded\?videoId=\|`
Add support for embedded vevo player (Fixes #1957) 2013-12-17 04:45:21 +08:00			`vevo:)`
			`(?P<id>[^&?#]+)'''`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`_TESTS = [{`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',`
			`'file': 'GB1101300280.mp4',`
			`"md5": "06bea460acb744eab74a9d7dcb4bfd61",`
			`'info_dict': {`
			`"upload_date": "20130624",`
			`"uploader": "Hurts",`
			`"title": "Somebody to Die For",`
			`"duration": 230.12,`
			`"width": 1920,`
			`"height": 1080,`
Move tests to the IE definitions 2013-06-28 02:46:46 +08:00			`}`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`}]`
			`_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'`
Add VevoIE 2013-06-24 18:31:41 +08:00
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`def _formats_from_json(self, video_info):`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`last_version = {'version': -1}`
			`for version in video_info['videoVersions']:`
			`# These are the HTTP downloads, other types are for different manifests`
			`if version['sourceType'] == 2:`
			`if version['version'] > last_version['version']:`
			`last_version = version`
			`if last_version['version'] == -1:`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`raise ExtractorError('Unable to extract last version of the video')`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00
			`renditions = xml.etree.ElementTree.fromstring(last_version['data'])`
			`formats = []`
			`# Already sorted from worst to best quality`
			`for rend in renditions.findall('rendition'):`
			`attr = rend.attrib`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`formats.append({`
[vevo] Add more format details 2013-10-29 22:10:09 +08:00			`'url': attr['url'],`
			`'format_id': attr['name'],`
			`'format_note': format_note,`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`'height': int(attr['frameheight']),`
			`'width': int(attr['frameWidth']),`
			`})`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`return formats`

			`def _formats_from_smil(self, smil_xml):`
			`formats = []`
			`smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))`
			`els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')`
			`for el in els:`
			`src = el.attrib['src']`
			`m = re.match(r'''(?xi)`
			`(?P<ext>[a-z0-9]+):`
			`(?P<path>`
			`[/a-z0-9]+ # The directory and main part of the URL`
			`_(?P<cbr>[0-9]+)k`
			`_(?P<width>[0-9]+)x(?P<height>[0-9]+)`
			`_(?P<vcodec>[a-z0-9]+)`
			`_(?P<vbr>[0-9]+)`
			`_(?P<acodec>[a-z0-9]+)`
			`_(?P<abr>[0-9]+)`
			`\.[a-z0-9]+ # File extension`
			`)''', src)`
			`if not m:`
			`continue`

			`format_url = self._SMIL_BASE_URL + m.group('path')`
			`formats.append({`
			`'url': format_url,`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'format_id': 'SMIL_' + m.group('cbr'),`
Add automatic generation of format note based on bitrate and codecs 2013-11-16 08:08:43 +08:00			`'vcodec': m.group('vcodec'),`
			`'acodec': m.group('acodec'),`
			`'vbr': int(m.group('vbr')),`
			`'abr': int(m.group('abr')),`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`'ext': m.group('ext'),`
			`'width': int(m.group('width')),`
			`'height': int(m.group('height')),`
			`})`
			`return formats`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id')`

			`json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`video_info = self._download_json(json_url, video_id)['video']`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00
			`formats = self._formats_from_json(video_info)`
			`try:`
			`smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (`
			`self._SMIL_BASE_URL, video_id, video_id.lower())`
			`smil_xml = self._download_webpage(smil_url, video_id,`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'Downloading SMIL info')`
[vevo] Readd support for SMIL (Fixes #1683) 2013-10-30 08:14:17 +08:00			`formats.extend(self._formats_from_smil(smil_xml))`
			`except ExtractorError as ee:`
			`if not isinstance(ee.cause, compat_HTTPError):`
			`raise`
			`self._downloader.report_warning(`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`'Cannot download SMIL information, falling back to JSON ..')`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00
[vevo] Fix timestamp handling ( / 1000 is implicit float division ) 2013-10-29 21:00:01 +08:00			`timestamp_ms = int(self._search_regex(`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))`
[vevo] Fix timestamp handling ( / 1000 is implicit float division ) 2013-10-29 21:00:01 +08:00			`upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)`
[vevo] Simplify and use unicode_literals 2014-01-31 20:56:45 +08:00			`return {`
[vevo] Some improvements (fixes #1580) Extract the info from http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc={id} Some videos don't have an smil manifest, extract the video urls directly from the json and use the last version of the video. Extract all the available formats and set the 'formats' field of the result 2013-10-09 03:23:55 +08:00			`'id': video_id,`
			`'title': video_info['title'],`
			`'formats': formats,`
			`'thumbnail': video_info['imageUrl'],`
			`'upload_date': upload_date.strftime('%Y%m%d'),`
			`'uploader': video_info['mainArtists'][0]['artistName'],`
			`'duration': video_info['duration'],`
			`}`