# coding: utf-8
# youtube_dl/extractor/theguardian.py
#
# Extractor for videos hosted on theguardian.com, the website of
# The Guardian newspaper.  Originally submitted by anovicecodemonkey on
# 30 Jan 2014 as the patch "Add support for The Guardian website".
import re

from .common import InfoExtractor


class TheGuardianIE(InfoExtractor):
    """Information extractor for video pages on theguardian.com."""

    # At least four path segments precede the video slug, e.g.
    # /world/video/2014/jan/29/<slug>.  The slug stops at the next '/', '?'
    # or '#', so a trailing slash or query string never ends up in the id.
    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/(?:[^/]+/){4,}(?P<video_id>[^/?#]+)'

    _TEST = {
        u'url': u'http://www.theguardian.com/world/video/2014/jan/29/president-barack-obama-state-union-address-video',
        u'file': u'president-barack-obama-state-union-address-video.mp4',
        u'md5': u'c3c4d57157bd28a20e877a0ec796a6cc',
        u'info_dict': {
            u"title": u"President Barack Obama delivers State of the Union address – video"
        }
    }

    def _real_extract(self, url):
        """Extract the video information for a single Guardian video page.

        The page at ``url`` is downloaded and scanned for the ``file : '...'``
        setting of the embedded player, which holds the media URL.

        Args:
            url: Page URL; expected to match ``_VALID_URL``.

        Returns:
            A one-element list holding a dict with the keys ``id``, ``url``,
            ``ext`` and ``title``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Log that we are starting to parse the page.
        self.report_extraction(video_id)

        # The media URL (always an .mp4 file) is set in the JSON
        # JWPlayerOptions() object, e.g.  file : 'video.mp4',
        # Whitespace around the colon varies, and the capture must stop at
        # the closing quote so that later quoted values on the same line are
        # not swallowed.
        video_url = self._html_search_regex(
            r'file\s*:\s*\'([^\']+)\',', webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': self._og_search_title(webpage),
        }]