from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( url_basename, unescapeHTML, js_to_json, ExtractorError, ) import re class CBCIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cbc.ca/[^/]+/' _TESTS = [ { 'url': 'http://www.cbc.ca/news/thenational/the-real-cost-of-the-world-s-most-expensive-drug-1.3126338', 'info_dict': { 'id': 'if3k_n58u3hDrVX9dOXSTbtHBnSZGQpe', 'ext': 'flv', 'title': 'The real cost of the world\'s most expensive drug', 'description': 'md5:407fb27bb8b10c2e1447bbad0c27e551', }, 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.cbc.ca/player/News/ID/2672225049/', 'info_dict': { 'id': 'VfTVl5c2pr40a9jxAMWGIRZO8Mz4ubPZ', 'ext': 'flv', 'title': 'WATCH: New Earth from space image released by NASA', 'description': 'md5:3ddd36b5d1066a067a0b0c8891a72506', }, 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.cbc.ca/natureofthings/episodes/stonehenge-uncovered', 'info_dict': { 'id': 'QPnDq_piKkN5x0dH7SQF85cyJb_KOsG0', 'ext': 'flv', 'title': 'Stonehenge Uncovered', }, 'add_ie': ['ThePlatform'], 'skip': 'Canada only', } ] def _real_extract(self, url): # from http://www.cbc.ca/i/caffeine/js/Caffeine.js # TP_FEED_DOMAIN:"http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?" # MPX_ACCOUNT_PID:"h9dtGB" tp_feed_domain = "http://tpfeed.cbc.ca/f/h9dtGB/5akSXx4Ng_Zn?" mpx_account_id = "h9dtGB" name = url_basename(url) webpage = self._download_webpage(url, name) title = unescapeHTML( self._search_regex('