youtube-dl/youtube_dl/extractor/ted.py

from __future__ import unicode_literals

import json
import re

from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_str,
)


class TEDIE(SubtitlesInfoExtractor):
    '''Extractor for ted.com talk, playlist, watch and embed URLs.'''

    # Verbose-mode pattern for the supported URLs.  _real_extract reads the
    # 'proto', 'type', 'urlmain', 'type_talk', 'type_watch' and 'name' groups
    # to decide how a URL is handled.
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
        (
            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
            |
            ((?P<type_talk>talks)) # We have a simple talk
            |
            (?P<type_watch>watch)/[^/]+/[^/]+
        )
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>[\w-]+) # Here goes the name and then ".html"
        .*)$
        '''
    # Sample URLs together with the metadata expected for each of them
    # (presumably consumed by the project's test harness -- it is not used
    # anywhere in this class).
    _TESTS = [{
        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
        'info_dict': {
            'id': '102',
            'ext': 'mp4',
            'title': 'The illusion of consciousness',
            'description': ('Philosopher Dan Dennett makes a compelling '
                'argument that not only don\'t we understand our own '
                'consciousness, but that half the time our brains are '
                'actively fooling us.'),
            'uploader': 'Dan Dennett',
            'width': 854,
            'duration': 1308,
        }
    }, {
        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
        'md5': '226f4fb9c62380d11b7995efa4c87994',
        'info_dict': {
            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
            'ext': 'mp4',
            'title': 'Vishal Sikka: The beauty and power of algorithms',
            'thumbnail': 're:^https?://.+\.jpg',
            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
        }
    }, {
        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
        'info_dict': {
            'id': '1972',
            'ext': 'mp4',
            'title': 'Be passionate. Be courageous. Be your best.',
            'uploader': 'Gabby Giffords and Mark Kelly',
            'description': 'md5:5174aed4d0f16021b704120360f72b92',
            'duration': 1128,
        },
    }, {
        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
        'info_dict': {
            'id': '10',
            'title': 'Who are the hackers?',
        },
        'playlist_mincount': 6,
    }, {
        # contains a youtube video
        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
        'add_ie': ['Youtube'],
        'info_dict': {
            'id': '_ZG8HBuDjgc',
            'ext': 'mp4',
            'title': 'Douglas Adams: Parrots the Universe and Everything',
            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
            'uploader': 'University of California Television (UCTV)',
            'uploader_id': 'UCtelevision',
            'upload_date': '20080522',
        },
        'params': {
            'skip_download': True,
        },
    }]

    # Extra fields merged by _talk_info into a native download format whose
    # format id matches one of these keys.
    _NATIVE_FORMATS = {
        'low': {'preference': 1, 'width': 320, 'height': 180},
        'medium': {'preference': 2, 'width': 512, 'height': 288},
        'high': {'preference': 3, 'width': 854, 'height': 480},
    }

    def _extract_info(self, webpage):
        '''Return the initialisation data embedded in *webpage*, parsed from JSON.

        The data is the JSON object passed as second argument to a
        ``q("<name>.init", {...})`` call that ends a script element.  The
        lookup is delegated to ``self._search_regex``; ``json.loads`` raises
        ``ValueError`` if the captured text is not valid JSON.
        '''
        # The dot is escaped so that only a literal ".init" suffix matches;
        # unescaped it would accept any character in that position.
        info_json = self._search_regex(
            r'q\("\w+\.init",({.+})\)</script>', webpage, 'info json')
        return json.loads(info_json)

    def _real_extract(self, url):
        '''Route *url* to the handler for its kind of TED page.

        Embed URLs are handed back as a URL result pointing at the matching
        ``www`` address; talk, watch and playlist URLs are processed by
        ``_talk_info``, ``_watch_info`` and ``_playlist_videos_info``.
        '''
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        url_type = mobj.group('type')
        if url_type == 'embed':
            # Same URL with the "embed" host prefix swapped for "www".
            return self.url_result(
                mobj.group('proto') + 'www' + mobj.group('urlmain'), 'TED')

        name = mobj.group('name')
        if mobj.group('type_talk'):
            handler = self._talk_info
        elif mobj.group('type_watch'):
            handler = self._watch_info
        else:
            # Neither a talk nor a watch page: treat it as a playlist.
            handler = self._playlist_videos_info
        return handler(url, name)

    def _playlist_videos_info(self, url, name):
        '''Build the playlist result for the playlist page at *url*.

        Every talk listed in the page data becomes a URL entry pointing at
        its ``/talks/<slug>`` page; the playlist id and title come from the
        page's ``playlist`` record.
        '''
        page = self._download_webpage(
            url, name, 'Downloading playlist webpage')
        data = self._extract_info(page)
        meta = data['playlist']

        entries = []
        for talk in data['talks']:
            talk_url = 'http://www.ted.com/talks/' + talk['slug']
            entries.append(self.url_result(talk_url, self.ie_key()))

        return self.playlist_result(
            entries,
            playlist_id=compat_str(meta['id']),
            playlist_title=meta['title'])

    def _talk_info(self, url, video_name):
        '''Extract a single talk page.

        Returns:
            * a ``url`` result dict when the talk data has an ``external``
              record (the video is hosted by another service);
            * ``None`` after listing the subtitles when the ``listsubtitles``
              downloader option is set;
            * otherwise an info dict with the formats, subtitles and
              metadata of the talk.
        '''
        webpage = self._download_webpage(url, video_name)
        self.report_extraction(video_name)

        # Only the first entry of the page's talk list is used.
        talk_info = self._extract_info(webpage)['talks'][0]

        external = talk_info.get('external')
        if external is not None:
            self.to_screen('Found video from %s' % external['service'])
            return {
                '_type': 'url',
                'url': external['uri'],
            }

        # Tolerate a missing or null 'nativeDownloads' entry: it is handled
        # like an empty mapping so that the RTMP fallback below is used
        # instead of failing on ``.items()``.
        native_downloads = talk_info.get('nativeDownloads') or {}
        formats = [{
            'url': format_url,
            'format_id': format_id,
            'format': format_id,
        } for (format_id, format_url) in native_downloads.items()
            if format_url is not None]
        if formats:
            for f in formats:
                # Add preference/width/height for the known format ids.
                finfo = self._NATIVE_FORMATS.get(f['format_id'])
                if finfo:
                    f.update(finfo)
        else:
            # Use rtmp downloads
            formats = [{
                'format_id': f['name'],
                'url': talk_info['streamer'],
                'play_path': f['file'],
                'ext': 'flv',
                'width': f['width'],
                'height': f['height'],
                'tbr': f['bitrate'],
            } for f in talk_info['resources']['rtmp']]
        self._sort_formats(formats)

        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return

        # The thumbnail is optional metadata: a missing or empty value is
        # reported as None rather than aborting the extraction.
        thumbnail = talk_info.get('thumb') or None
        if thumbnail:
            if thumbnail.startswith('//'):
                # Protocol-relative URL: only the scheme is missing.
                thumbnail = 'http:' + thumbnail
            elif not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail
        return {
            'id': video_id,
            'title': talk_info['title'].strip(),
            'uploader': talk_info['speaker'],
            'thumbnail': thumbnail,
            'description': self._og_search_description(webpage),
            'subtitles': video_subtitles,
            'formats': formats,
            'duration': talk_info.get('duration'),
        }

    def _get_available_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
                sub_lang_list[l] = url
            return sub_lang_list
        else:
            self._downloader.report_warning('video doesn\'t have subtitles')
            return {}

    def _watch_info(self, url, name):
        '''Extract a video from a ``watch/`` page.

        The media URL and the optional thumbnail come from the JSON held in
        the page's ``data-config`` attribute; title and description are
        scraped from the HTML.  The description lookup is non-fatal.  *name*
        is used as the video id.
        '''
        page = self._download_webpage(url, name)

        # Player configuration: JSON stored in a single-quoted attribute.
        player_config = json.loads(self._html_search_regex(
            r"data-config='([^']+)", page, 'config'))
        media_url = player_config['video']['url']
        image = player_config.get('image', {})
        thumb_url = image.get('url')

        headline = self._html_search_regex(
            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", page, 'title')

        # Two alternative page layouts for the "about this talk" text.
        about_patterns = [
            r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
            r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
        ]
        summary = self._html_search_regex(
            about_patterns, page, 'description', fatal=False)

        return {
            'id': name,
            'url': media_url,
            'title': headline,
            'thumbnail': thumb_url,
            'description': summary,
        }
[ted] Use unicode_literals 2014-01-17 10:52:17 +08:00			`from __future__ import unicode_literals`

Move TED IE into its own file 2013-06-24 03:55:53 +08:00			`import json`
			`import re`

[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`from .subtitles import SubtitlesInfoExtractor`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00
[ted] fixed error in case of no subtitles present I created a test, but I leave it commented since TED videos get new subtitles frequently. 2013-11-05 19:00:13 +08:00			`from ..utils import (`
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`compat_str,`
[ted] fixed error in case of no subtitles present I created a test, but I leave it commented since TED videos get new subtitles frequently. 2013-11-05 19:00:13 +08:00			`)`

[ted] Use unicode_literals 2014-01-17 10:52:17 +08:00
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`class TEDIE(SubtitlesInfoExtractor):`
[ted] Simplify embed code (#2587) 2014-03-20 23:33:23 +08:00			`_VALID_URL = r'''(?x)`
			`(?P<proto>https?://)`
			`(?P<type>www\|embed)(?P<urlmain>\.ted\.com/`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`(`
			`(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist`
			`\|`
			`((?P<type_talk>talks)) # We have a simple talk`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`\|`
			`(?P<type_watch>watch)/[^/]+/[^/]+`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`)`
			`(/lang/(.*?))? # The url may contain the language`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`/(?P<name>[\w-]+) # Here goes the name and then ".html"`
[ted] Simplify embed code (#2587) 2014-03-20 23:33:23 +08:00			`.*)$`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`'''`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`_TESTS = [{`
[ted] Use unicode_literals 2014-01-17 10:52:17 +08:00			`'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',`
[ted] Update test md5 2014-06-12 21:33:53 +08:00			`'md5': 'fc94ac279feebbce69f21c0c6ee82810',`
[ted] Use unicode_literals 2014-01-17 10:52:17 +08:00			`'info_dict': {`
[ted] Remove unused import and modernize test 2014-03-05 21:27:45 +08:00			`'id': '102',`
			`'ext': 'mp4',`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`'title': 'The illusion of consciousness',`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`'description': ('Philosopher Dan Dennett makes a compelling '`
			`'argument that not only don\'t we understand our own '`
			`'consciousness, but that half the time our brains are '`
			`'actively fooling us.'),`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`'uploader': 'Dan Dennett',`
[ted] Add width and height (Fixes #2716) 2014-04-07 19:07:07 +08:00			`'width': 854,`
[ted] Extract duration (closes #4155) 2014-11-12 16:30:57 +08:00			`'duration': 1308,`
Move tests to the IE definitions 2013-06-28 02:46:46 +08:00			`}`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`}, {`
			`'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'md5': '226f4fb9c62380d11b7995efa4c87994',`
			`'info_dict': {`
			`'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',`
			`'ext': 'mp4',`
			`'title': 'Vishal Sikka: The beauty and power of algorithms',`
			`'thumbnail': 're:^https?://.+\.jpg',`
			`'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',`
			`}`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 21:23:12 +08:00			`}, {`
			`'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',`
			`'info_dict': {`
			`'id': '1972',`
[ted] Update test 2014-04-22 20:49:41 +08:00			`'ext': 'mp4',`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 21:23:12 +08:00			`'title': 'Be passionate. Be courageous. Be your best.',`
			`'uploader': 'Gabby Giffords and Mark Kelly',`
[ted] Update test 2014-04-22 20:49:41 +08:00			`'description': 'md5:5174aed4d0f16021b704120360f72b92',`
[ted] Extract duration (closes #4155) 2014-11-12 16:30:57 +08:00			`'duration': 1128,`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 21:23:12 +08:00			`},`
Move playlist tests to extractors. From now on, test_download will run these tests. That means we benefit not only from the networking setup in there, but also from the other tests (for example test_all_urls to find problems with _VALID_URLs). 2014-08-28 06:58:24 +08:00			`}, {`
			`'url': 'http://www.ted.com/playlists/who_are_the_hackers',`
			`'info_dict': {`
			`'id': '10',`
			`'title': 'Who are the hackers?',`
			`},`
			`'playlist_mincount': 6,`
[ted] Add support for external videos (fixes #3948) 2014-10-15 18:24:11 +08:00			`}, {`
			`# contains a youtube video`
			`'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',`
			`'add_ie': ['Youtube'],`
			`'info_dict': {`
			`'id': '_ZG8HBuDjgc',`
			`'ext': 'mp4',`
			`'title': 'Douglas Adams: Parrots the Universe and Everything',`
			`'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',`
			`'uploader': 'University of California Television (UCTV)',`
			`'uploader_id': 'UCtelevision',`
			`'upload_date': '20080522',`
			`},`
			`'params': {`
			`'skip_download': True,`
			`},`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`}]`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00
[ted] Add width and height (Fixes #2716) 2014-04-07 19:07:07 +08:00			`_NATIVE_FORMATS = {`
			`'low': {'preference': 1, 'width': 320, 'height': 180},`
			`'medium': {'preference': 2, 'width': 512, 'height': 288},`
			`'high': {'preference': 3, 'width': 854, 'height': 480},`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`}`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`def _extract_info(self, webpage):`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',`
			`webpage, 'info json')`
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`return json.loads(info_json)`

Move TED IE into its own file 2013-06-24 03:55:53 +08:00			`def _real_extract(self, url):`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`m = re.match(self._VALID_URL, url, re.VERBOSE)`
[ted] Simplify embed code (#2587) 2014-03-20 23:33:23 +08:00			`if m.group('type') == 'embed':`
			`desktop_url = m.group('proto') + 'www' + m.group('urlmain')`
			`return self.url_result(desktop_url, 'TED')`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`name = m.group('name')`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00			`if m.group('type_talk'):`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`return self._talk_info(url, name)`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`elif m.group('type_watch'):`
			`return self._watch_info(url, name)`
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`else:`
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`return self._playlist_videos_info(url, name)`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`def _playlist_videos_info(self, url, name):`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00			`'''Returns the videos of the playlist'''`
[ted] Fix playlists (Fixes #1770) 2013-11-15 21:33:51 +08:00
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`webpage = self._download_webpage(url, name,`
			`'Downloading playlist webpage')`
			`info = self._extract_info(webpage)`
			`playlist_info = info['playlist']`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00
[ted] Fix playlists (Fixes #1770) 2013-11-15 21:33:51 +08:00			`playlist_entries = [`
[ted] Remove superfluous u prefixes 2014-04-21 18:34:32 +08:00			`self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())`
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`for talk in info['talks']`
[ted] Fix playlists (Fixes #1770) 2013-11-15 21:33:51 +08:00			`]`
			`return self.playlist_result(`
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`playlist_entries,`
			`playlist_id=compat_str(playlist_info['id']),`
			`playlist_title=playlist_info['title'])`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00
[ted] Style fixes 2014-03-05 20:27:26 +08:00			`def _talk_info(self, url, video_name):`
			`webpage = self._download_webpage(url, video_name)`
Move TED IE into its own file 2013-06-24 03:55:53 +08:00			`self.report_extraction(video_name)`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00
[ted] Fix playlist extraction and add a test 2014-03-05 20:22:10 +08:00			`talk_info = self._extract_info(webpage)['talks'][0]`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00
[ted] Add support for external videos (fixes #3948) 2014-10-15 18:24:11 +08:00			`if talk_info.get('external') is not None:`
			`self.to_screen('Found video from %s' % talk_info['external']['service'])`
			`return {`
			`'_type': 'url',`
			`'url': talk_info['external']['uri'],`
			`}`

[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`formats = [{`
			`'url': format_url,`
			`'format_id': format_id,`
			`'format': format_id,`
[ted] Use the rtmp links if there http downloads are not available. 2014-04-14 21:23:12 +08:00			`} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]`
			`if formats:`
			`for f in formats:`
			`finfo = self._NATIVE_FORMATS.get(f['format_id'])`
			`if finfo:`
			`f.update(finfo)`
			`else:`
			`# Use rtmp downloads`
			`formats = [{`
			`'format_id': f['name'],`
			`'url': talk_info['streamer'],`
			`'play_path': f['file'],`
			`'ext': 'flv',`
			`'width': f['width'],`
			`'height': f['height'],`
			`'tbr': f['bitrate'],`
			`} for f in talk_info['resources']['rtmp']]`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`self._sort_formats(formats)`

[ted] Remove unused import and modernize test 2014-03-05 21:27:45 +08:00			`video_id = compat_str(talk_info['id'])`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`# subtitles`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`video_subtitles = self.extract_subtitles(video_id, talk_info)`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`if self._downloader.params.get('listsubtitles', False):`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`self._list_available_subtitles(video_id, talk_info)`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`return`

[ted] Add 'http://' to the thumbnail url if it's missing 2014-03-16 18:24:11 +08:00			`thumbnail = talk_info['thumb']`
			`if not thumbnail.startswith('http'):`
			`thumbnail = 'http://' + thumbnail`
[ted] simplify 2013-11-15 21:06:38 +08:00			`return {`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`'id': video_id,`
[generic] Fix testcases 2014-09-29 11:12:57 +08:00			`'title': talk_info['title'].strip(),`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`'uploader': talk_info['speaker'],`
[ted] Add 'http://' to the thumbnail url if it's missing 2014-03-16 18:24:11 +08:00			`'thumbnail': thumbnail,`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`'description': self._og_search_description(webpage),`
[ted] Added support for subtitle download 2013-11-03 02:48:39 +08:00			`'subtitles': video_subtitles,`
[ted] Prepare #980 merge 2013-10-04 16:32:34 +08:00			`'formats': formats,`
[ted] Extract duration (closes #4155) 2014-11-12 16:30:57 +08:00			`'duration': talk_info.get('duration'),`
[ted] Prepare #980 merge 2013-10-04 16:32:34 +08:00			`}`

[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`def _get_available_subtitles(self, video_id, talk_info):`
			`languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]`
			`if languages:`
			`sub_lang_list = {}`
			`for l in languages:`
			`url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)`
			`sub_lang_list[l] = url`
			`return sub_lang_list`
			`else:`
[ted] Remove superfluous u prefixes 2014-04-21 18:34:32 +08:00			`self._downloader.report_warning('video doesn\'t have subtitles')`
[ted] Fix video extraction The site has been redesigned 2014-03-05 04:47:01 +08:00			`return {}`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00
			`def _watch_info(self, url, name):`
			`webpage = self._download_webpage(url, name)`

			`config_json = self._html_search_regex(`
			`r"data-config='([^']+)", webpage, 'config')`
			`config = json.loads(config_json)`
			`video_url = config['video']['url']`
			`thumbnail = config.get('image', {}).get('url')`

			`title = self._html_search_regex(`
			`r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')`
			`description = self._html_search_regex(`
[ted] Extend search for description 2014-04-21 18:37:16 +08:00			`[`
			`r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',`
			`r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',`
			`],`
[ted] Add support for watch/ URLs (Fixes #2637) 2014-03-27 09:22:40 +08:00			`webpage, 'description', fatal=False)`

			`return {`
			`'id': name,`
			`'url': video_url,`
			`'title': title,`
			`'thumbnail': thumbnail,`
			`'description': description,`
			`}`