1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-13 06:27:26 +08:00
2015-07-25 20:02:59 -05:00

97 lines
3.6 KiB
Python

from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
url_basename,
unescapeHTML,
js_to_json,
ExtractorError,
)
import re
class CBCIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cbc.ca/[^/]+/'
_TESTS = [
{
'url': 'http://www.cbc.ca/news/thenational/the-real-cost-of-the-world-s-most-expensive-drug-1.3126338',
'info_dict': {
'id': 'if3k_n58u3hDrVX9dOXSTbtHBnSZGQpe',
'ext': 'flv',
'title': 'The real cost of the world\'s most expensive drug',
'description': 'md5:407fb27bb8b10c2e1447bbad0c27e551',
},
'add_ie': ['ThePlatform'],
},
{
'url': 'http://www.cbc.ca/player/News/ID/2672225049/',
'info_dict': {
'id': 'VfTVl5c2pr40a9jxAMWGIRZO8Mz4ubPZ',
'ext': 'flv',
'title': 'WATCH: New Earth from space image released by NASA',
'description': 'md5:3ddd36b5d1066a067a0b0c8891a72506',
},
'add_ie': ['ThePlatform'],
},
{
'url': 'http://www.cbc.ca/natureofthings/episodes/stonehenge-uncovered',
'info_dict': {
'id': 'QPnDq_piKkN5x0dH7SQF85cyJb_KOsG0',
'ext': 'flv',
'title': 'Stonehenge Uncovered',
},
'add_ie': ['ThePlatform'],
'skip': 'Canada only',
}
]
def _real_extract(self, url):
# from http://www.cbc.ca/i/caffeine/js/Caffeine.js
# TP_FEED_DOMAIN:"http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?"
# MPX_ACCOUNT_PID:"h9dtGB"
tp_feed_domain = "http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?"
mpx_account_id = "h9dtGB"
name = url_basename(url)
webpage = self._download_webpage(url, name)
title = self._html_search_regex('<title>\s*(.+?)\s*</title>', webpage, 'title')
cbcapp = re.findall(
r'CBC.APP.Caffeine.initInstance\((.+?)\);', webpage, re.DOTALL)
clipids = []
for jstr in cbcapp:
vdata = self._parse_json(
jstr, 'javascript chunk', transform_source=js_to_json)
if 'clipId' in vdata:
if vdata['clipId'] not in clipids:
clipids.append(vdata['clipId'])
vids = []
for cid in clipids:
feedurl = tp_feed_domain + \
'range=1-1&byContent=byReleases%3DbyId%253D' + cid
feedpage = self._download_webpage(feedurl, 'feed for clip ' + cid)
cjson = self._parse_json(
feedpage, 'clip feed json', transform_source=js_to_json)
for ent in cjson.get('entries', []):
for content in ent.get('content', []):
# assuming multi-content is playlist or multi-part video
vid = {}
for release in content.get('releases', []):
if 'url' in vid:
self.report_warning(
cid + ': multi-release video? Skipping, if content is missing please file a bug report')
continue
vid['url'] = 'http://link.theplatform.com/s/' + \
mpx_account_id + '/' + release['pid']
if 'url' in vid:
vids.append(self.url_result(vid['url']))
if not vids:
raise ExtractorError('No video found', expected=True)
if len(vids) > 1:
return self.playlist_result(vids, name, title)
return vids[0]