# Origin: git patch 4866273f503de2344bb5b8a98f9b58a062803029
# From: felix
# Date: Fri, 20 Mar 2015 22:37:02 +0100
# Subject: [PATCH] 4oD extractor, with a testcase mostly unusable, since
# FlashAccess DRM is not implemented, here or in ffmpeg. youtube-dl can at
# least download subtitles and metadata, and make the stream URL available
# through -j for external decryption.
#
# The patch creates this module as youtube_dl/extractor/channel4.py and
# registers it in youtube_dl/extractor/__init__.py by adding
#     from .channel4 import Channel4IE
# between the existing CeskaTelevizeIE and Channel9IE imports.
from __future__ import unicode_literals

import json

from .common import InfoExtractor
from ..compat import (
    compat_urllib_request,
    compat_urlparse,
)
from ..utils import (
    unified_strdate,
    ExtractorError
)


class Channel4IE(InfoExtractor):
    """Extractor for Channel 4 on-demand (4oD) episode pages.

    The episode page embeds a JSON object (``onDemand.selectedEpisode``)
    carrying the episode metadata and a ``requestId``.  That id is used to
    fetch an XML asset description from ``ais.channel4.com`` which holds the
    f4m stream URI, an optional subtitles URI and a DRM token.

    Besides the usual info-dict keys, the result carries a few
    underscore-prefixed extras (``_programme_title``, ``_drm_token``,
    ``_programme_series``, ``_programme_episode``).
    """

    # The ``id`` group is the trailing URL component; the test case below
    # expects it to be the video id ('49114-002').
    _VALID_URL = r'http://(?:www\.)?channel4\.com/programmes/(?P<programme>.*?)/on-demand/(?P<id>.*)'

    _TESTS = [{
        'url': 'http://www.channel4.com/programmes/black-mirror/on-demand/49114-002',
        'info_dict': {
            'id': '49114-002',
            '_programme_title': "Black Mirror",
            'title': "15 Million Merits",
            # NOTE(review): the original literal contained a raw line break
            # after "drudgery. "; it is written as '\n' here.  Confirm the
            # exact separator against the synopsis served by the site.
            'description': (
                "In the near future, everyone is confined to a life of strange physical drudgery. \n"
                "The only way to escape is to enter the 'Hot Shot' talent show and pray you can impress the judges."),
            'duration': 222780,
        },
        'params': {
            # unimplemented DRM
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Build the info dict for the episode page at *url*.

        Raises ExtractorError when the embedded episode JSON cannot be
        parsed, when it carries no ``requestId``, when the asset service
        reports a return code other than "200", or when the asset
        description contains no stream URI.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        episode_json = self._search_regex(
            r'onDemand\.selectedEpisode = (?P<json>{.+?});\n',
            webpage, 'video data json')
        try:
            video_data = json.loads(episode_json)
        except ValueError as err:
            raise ExtractorError(
                '%s: unable to parse video data json: %s' % (video_id, err))

        # The thumbnail is optional metadata; do not abort when it is absent.
        thumbnail_url = (video_data.get('pictureComponent') or {}).get('url')
        thumbnails = [{'url': thumbnail_url}] if thumbnail_url else []

        request_id = video_data.get('requestId')
        if not request_id:
            raise ExtractorError(
                '%s: no requestId found in video data' % video_id)
        # XXX: the Flash player also puts the Unix timestamp in the query
        # string. the download works without it just fine, though
        asset_url = 'http://ais.channel4.com/asset/%s' % (request_id)
        stream_info = self._download_xml(asset_url, video_id)

        service_report = stream_info.find('./serviceReport')
        if service_report is not None:
            return_code = service_report.attrib.get('returnCode')
            if return_code != "200":
                raise ExtractorError(
                    service_report.findtext('./description') or
                    'Asset service returned code %s' % return_code,
                    expected=True)

        # The subtitles URI may be relative, hence the join against asset_url.
        subtitles = None
        subtitles_uri = stream_info.findtext('./assetInfo/subtitlesFileUri')
        if subtitles_uri:
            subtitles = {
                'en': [{
                    'ext': 'sami',
                    'url': compat_urlparse.urljoin(asset_url, subtitles_uri),
                }]
            }

        stream_uri = stream_info.findtext('./assetInfo/uriData/streamUri')
        if not stream_uri:
            raise ExtractorError(
                '%s: no stream URI in asset description' % video_id)
        formats = self._extract_f4m_formats(stream_uri, video_id)

        # txTime is optional; strip so a missing time leaves no dangling space.
        upload_date = None
        tx_date = video_data.get('txDate')
        if tx_date:
            upload_date = unified_strdate(
                ('%s %s' % (tx_date, video_data.get('txTime') or '')).strip())

        # NOTE(review): the test case expects 222780 (= 3713 * 60).  If
        # assetDuration is already expressed in seconds, this factor is
        # wrong -- confirm the unit against the site's data.
        asset_duration = video_data.get('assetDuration')
        duration = asset_duration * 60 if asset_duration is not None else None

        return {
            'id': video_id,
            '_programme_title': stream_info.findtext('./assetInfo/brandTitle'),
            'title': stream_info.findtext('./assetInfo/episodeTitle'),
            'upload_date': upload_date,
            'description': video_data.get('synopsis'),
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
            'duration': duration,
            '_drm_token': stream_info.findtext('./assetInfo/uriData/token'),
            '_programme_series': video_data.get("seriesNumber"),
            '_programme_episode': video_data.get("episodeNumber"),
        }