youtube-dl/youtube_dl/extractor/bloomberg.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor


class BloombergIE(InfoExtractor):
    """Extractor for videos embedded in bloomberg.com pages.

    The page is scraped for its ``bmmrId``; that id is then passed to the
    ``/api/embed`` endpoint, whose JSON response lists the available streams.
    """

    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'

    _TESTS = [{
        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
        # The md5 checksum changes
        'info_dict': {
            'id': 'qurhIVlJSB6hzkVi229d8g',
            'ext': 'flv',
            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
            'description': 'md5:a8ba0302912d03d246979735c17d2761',
        },
        'params': {
            'format': 'best[format_id^=hds]',
        },
    }, {
        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
        'only_matching': True,
    }, {
        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video found on the page at *url*.

        Returns a dict with the keys ``id``, ``title``, ``formats``,
        ``description`` and ``thumbnail``.
        """
        # The last URL path segment is only used to label the page download;
        # the id reported in the result is the bmmrId found in the page.
        name = self._match_id(url)
        webpage = self._download_webpage(url, name)
        video_id = self._search_regex(
            r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1',
            webpage, 'id', group='url')
        # Drop the trailing ": Video" suffix from the Open Graph title.
        title = re.sub(r': Video$', '', self._og_search_title(webpage))

        embed_info = self._download_json(
            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
        formats = []
        for stream in embed_info['streams']:
            stream_url = stream.get('url')
            if not stream_url:
                continue
            # Read the muxing format defensively, like the URL above: a
            # stream entry lacking the key must not abort the extraction
            # with a KeyError. Such entries take the non-TS (HDS) branch.
            if stream.get('muxing_format') == 'TS':
                # TS-muxed streams are handled as HLS (m3u8) manifests.
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                # Everything else is handled as an HDS (f4m) manifest.
                formats.extend(self._extract_f4m_formats(
                    stream_url, video_id, f4m_id='hds', fatal=False))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
        }
[bloomberg] Fix extraction (fixes #2154) Stop using the OoyalaIE, extract the f4m url instead. 2014-03-29 18:55:12 +08:00			`from __future__ import unicode_literals`

Add an extractor for Bloomberg (closes #1436) 2013-09-17 01:39:39 +08:00			`import re`

			`from .common import InfoExtractor`


			`class BloombergIE(InfoExtractor):`
[bloomberg] Relax _VALID_URL even more (Closes #7685) 2015-11-29 00:39:36 +08:00			`_VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'`
Add an extractor for Bloomberg (closes #1436) 2013-09-17 01:39:39 +08:00
[bloomberg] Reax _VALID_URL (Closes #7546) 2015-11-20 00:55:06 +08:00			`_TESTS = [{`
[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',`
[bloomberg] Extract the available formats (closes #2776) It uses a helper method in the InfoExtractor class. The downloader will pick the requested formats using the bitrate in the info dict. 2014-07-28 21:25:56 +08:00			`# The md5 checksum changes`
[bloomberg] Fix extraction (fixes #2154) Stop using the OoyalaIE, extract the f4m url instead. 2014-03-29 18:55:12 +08:00			`'info_dict': {`
			`'id': 'qurhIVlJSB6hzkVi229d8g',`
			`'ext': 'flv',`
			`'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',`
[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`'description': 'md5:a8ba0302912d03d246979735c17d2761',`
Add an extractor for Bloomberg (closes #1436) 2013-09-17 01:39:39 +08:00			`},`
[bloomberg] Fix test_Bloomberg In this test case, sometimes HLS is the best format while sometimes HDS is. To prevent occasional test failures, force HDS to be the best format. In the past, testing against HDS formats causes the same error as #9214, which is fixed as #9377 landed. 2016-05-12 20:05:43 +08:00			`'params': {`
			`'format': 'best[format_id^=hds]',`
			`},`
[bloomberg] Reax _VALID_URL (Closes #7546) 2015-11-20 00:55:06 +08:00			`}, {`
			`'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',`
			`'only_matching': True,`
[bloomberg] Relax _VALID_URL even more (Closes #7685) 2015-11-29 00:39:36 +08:00			`}, {`
			`'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',`
			`'only_matching': True,`
[bloomberg] Reax _VALID_URL (Closes #7546) 2015-11-20 00:55:06 +08:00			`}]`
Add an extractor for Bloomberg (closes #1436) 2013-09-17 01:39:39 +08:00
			`def _real_extract(self, url):`
[bloomberg] Modernize 2015-02-24 18:08:00 +08:00			`name = self._match_id(url)`
Add an extractor for Bloomberg (closes #1436) 2013-09-17 01:39:39 +08:00			`webpage = self._download_webpage(url, name)`
[bloomberg] Improve video id regex 2015-11-29 00:41:39 +08:00			`video_id = self._search_regex(`
			`r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1',`
			`webpage, 'id', group='url')`
[bloomberg] Fix extraction (fixes #2154) Stop using the OoyalaIE, extract the f4m url instead. 2014-03-29 18:55:12 +08:00			`title = re.sub(': Video$', '', self._og_search_title(webpage))`

[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`embed_info = self._download_json(`
			`'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)`
			`formats = []`
			`for stream in embed_info['streams']:`
[bloomberg] Improve formats extraction 2015-11-29 00:45:19 +08:00			`stream_url = stream.get('url')`
			`if not stream_url:`
			`continue`
[bloomberg] Modernize 2015-11-29 00:40:29 +08:00			`if stream['muxing_format'] == 'TS':`
Simplify formats accumulation for f4m/m3u8/smil formats Now all _extract_*_formats routines return a list 2015-12-29 02:58:24 +08:00			`formats.extend(self._extract_m3u8_formats(`
			`stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))`
[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`else:`
Simplify formats accumulation for f4m/m3u8/smil formats Now all _extract_*_formats routines return a list 2015-12-29 02:58:24 +08:00			`formats.extend(self._extract_f4m_formats(`
			`stream_url, video_id, f4m_id='hds', fatal=False))`
[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`self._sort_formats(formats)`

[bloomberg] Fix extraction (fixes #2154) Stop using the OoyalaIE, extract the f4m url instead. 2014-03-29 18:55:12 +08:00			`return {`
[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`'id': video_id,`
[bloomberg] Fix extraction (fixes #2154) Stop using the OoyalaIE, extract the f4m url instead. 2014-03-29 18:55:12 +08:00			`'title': title,`
[bloomberg] Adapt to website changes (fixes #5347) 2015-04-03 21:01:17 +08:00			`'formats': formats,`
[bloomberg] Fix extraction (fixes #2154) Stop using the OoyalaIE, extract the f4m url instead. 2014-03-29 18:55:12 +08:00			`'description': self._og_search_description(webpage),`
			`'thumbnail': self._og_search_thumbnail(webpage),`
			`}`