diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..ff66a74bd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -27,6 +27,7 @@ from .ard import ( ARDMediathekIE, SportschauIE, ) +from .arstechnica import ArstechnicaIE from .arte import ( ArteTvIE, ArteTVPlus7IE, diff --git a/youtube_dl/extractor/arstechnica.py b/youtube_dl/extractor/arstechnica.py new file mode 100644 index 000000000..f9353daa8 --- /dev/null +++ b/youtube_dl/extractor/arstechnica.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals +import urllib +import ast + +from .common import InfoExtractor + + +class ArstechnicaIE(InfoExtractor): + _VALID_URL = r'https?://(www.)?arstechnica.com/[a-z]+/[0-9]+/[0-9]+/(?P[-a-z0-9]+)/??' + _TEST = { + 'url': 'http://arstechnica.com/security/2015/09/video-3d-printed-tsa-travel-sentry-keys-really-do-open-tsa-locks/', + 'md5': 'd07dc4dd168a8bc7bf782dac7ad691db', + 'info_dict': { + 'id': 'video-3d-printed-tsa-travel-sentry-keys-really-do-open-tsa-locks', + 'ext': 'mp4', + 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'We downloaded the models from GitHub, tweaked, printed, and gained access.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + protocol = url.split("://")[0] + + title = self._html_search_regex(r'(?:Video: )*(.*?)', webpage, 'title') + + initial_js_url = protocol + ":" + self._html_search_regex(r'', webpage, 'initial_js_url') + initial_js_content = self._download_webpage(initial_js_url, video_id) + + loader_js_url = self._search_regex(r"var url = \\'([a-z:/.?]+)\\'", initial_js_content, 'loader_js_url') + params = { + 'videoId': self._search_regex(r"videoId: \\'([a-z0-9]+)\\'", initial_js_content, 'videoId'), + 'playerId': self._search_regex(r"playerId: \\'([a-z0-9]+)\\'", initial_js_content, 'playerId'), + 'target': self._search_regex(r"target: \\'([a-z0-9]+)\\'", initial_js_content, 'target'), + } + loader_js_url = "%s%s" % (loader_js_url, urllib.urlencode(params)) + loader_js_content = self._download_webpage(loader_js_url, video_id) + + sources_list = ast.literal_eval(self._search_regex(r'"sources":\[(.*)\],"thumb', loader_js_content, 'sources_list')) + + formats = [] + + for source in sources_list: + formats.append({ + 'format_id': source['type'], + 'url': source['src'], + 'quality': source['quality'], + }) + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'formats': formats, + 'thumbnail': self._search_regex(r'"thumb_path":"(.*)","title', loader_js_content, 'thumb_path'), + }