From 40daaef417d0da5d32933c721596c44c4ea0d616 Mon Sep 17 00:00:00 2001
From: Alex Seiler
Date: Fri, 27 Jan 2017 15:57:03 +0100
Subject: [PATCH] [visir] Add new information extractor

---
 youtube_dl/extractor/extractors.py |   1 +
 youtube_dl/extractor/generic.py    |  21 +++++
 youtube_dl/extractor/visir.py      | 125 +++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 youtube_dl/extractor/visir.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 81366f933..984c26b06 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1119,6 +1119,7 @@ from .viki import (
     VikiIE,
     VikiChannelIE,
 )
+from .visir import VisirMediaIE
 from .viu import (
     ViuIE,
     ViuPlaylistIE,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index a23486620..15f0b04ae 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -81,6 +81,7 @@ from .videa import VideaIE
 from .twentymin import TwentyMinutenIE
 from .ustream import UstreamIE
 from .openload import OpenloadIE
+from .visir import VisirMediaIE
 
 
 class GenericIE(InfoExtractor):
@@ -1473,6 +1474,20 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
             'add_ie': [TwentyMinutenIE.ie_key()],
+        },
+        {
+            # Visir embed
+            'url': 'http://www.visir.is/-viljum-hjalpa-theim-ad-hjalpa-sjalfum-ser-/article/2017170129096',
+            'info_dict': {
+                'id': 'VTV8CE25BB4-9132-48AD-A2EE-00AF0BAA02A0',
+                'ext': 'mp4',
+                'title': 'H\u00f3pur nemenda s\u00f6fnu\u00f0u pening fyrir Ge\u00f0hj\u00e1lp',
+                'description': None,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [VisirMediaIE.ie_key()],
         }
         # {
         #     # TODO: find another test
@@ -2438,6 +2453,12 @@ class GenericIE(InfoExtractor):
             return _playlist_from_matches(
                 openload_urls, ie=OpenloadIE.ie_key())
 
+        # Look for Visir embeds
+        visir_urls = VisirMediaIE._extract_urls(webpage)
+        if visir_urls:
+            return _playlist_from_matches(
+                visir_urls, ie=VisirMediaIE.ie_key())
+
         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(
             webpage, video_id, default={}, expected_type='VideoObject')
diff --git a/youtube_dl/extractor/visir.py b/youtube_dl/extractor/visir.py
new file mode 100644
index 000000000..a83928cb4
--- /dev/null
+++ b/youtube_dl/extractor/visir.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    base_url,
+    remove_start,
+    urljoin,
+)
+
+
+class VisirMediaIE(InfoExtractor):
+    """Extractor for clips served from visir.is media pages.
+
+    Matches ``visir.is/section/...fileid=<id>`` URLs, which includes the
+    ``iplayer`` URLs that ``_extract_urls`` builds for embedded players.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?visir\.is/section(?:/media)?/.+?fileid=(?P<id>[^/]+)$'
+    _TESTS = [{
+        'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP51729',
+        'md5': '1486324696d1b9f30fcea985a7922f2c',
+        'info_dict': {
+            'id': 'CLP51729',
+            'display_id': 'CLP51729',
+            'ext': 'mp4',
+            'title': 'Gu\u00f0j\u00f3n: Mj\u00f6g j\u00e1kv\u00e6\u00f0ur \u00e1 framhaldi\u00f0',
+            'description': None,
+            'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/51729_3.jpg'
+        },
+    }, {
+        'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP45905',
+        'info_dict': {
+            'id': 'CLP45905',
+            'display_id': 'CLP45905',
+            'ext': 'mp4',
+            'title': 'Eva Laufey - Nau\u00f0synlegt a\u00f0 b\u00f6rn f\u00e1i a\u00f0 koma n\u00e1l\u00e6gt matarger\u00f0',
+            'description': 'md5:24422433a08d270a3690d149edf113b8',
+            'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/45905_3.jpg',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        """Return iplayer URLs for the Visir players embedded in ``webpage``.
+
+        Only ``App.Player.Init({...})`` calls of type audio or video that are
+        hosted on ``visirvod.365cdn.is`` are considered.
+        """
+        media_base_url = 'http://www.visir.is/section/media/?template=iplayer&fileid=%s'
+        video_ids = [media_base_url % m.group('id') for m in re.finditer(
+            r'App\.Player\.Init\(\{[^\}]*Type:\s*\'(?:audio|video)\'[^\}]+FileId:\s*\'(?P<id>.+?)\'[^\}]+Host:\s*\'visirvod\.365cdn\.is\'',
+            webpage)]
+        return video_ids
+
+    def _extract_formats(self, filename, video_id, media_type):
+        """Return the sorted formats of the Wowza stream for ``filename``.
+
+        For ``media_type == 'video'`` only DASH is skipped; for any other
+        type the F4M and M3U8 protocols are skipped as well.
+        """
+        playlist_url = 'http://visirvod.365cdn.is/hls-vod/_definst_/mp4:%s/playlist.m3u8' % filename
+        if media_type == 'video':
+            formats = self._extract_wowza_formats(
+                playlist_url, video_id, skip_protocols=['dash'])
+        else:
+            formats = self._extract_wowza_formats(
+                playlist_url, video_id, skip_protocols=['dash', 'f4m', 'm3u8'])
+        self._sort_formats(formats)
+        return formats
+
+    def _real_extract(self, url):
+        """Return the info dict for the media page at ``url``."""
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        # Every player setting is read from the ``App.Player.Init({...})``
+        # call; ``%s`` is replaced with the name of the wanted key.
+        regex_pattern = r'App\.Player\.Init\s*\(\s*\{[^\}]*%s:[^\}]*?\'(.+?)\''
+        video_id = self._search_regex(
+            regex_pattern % 'FileId',
+            webpage, 'video id')
+        filename = self._search_regex(
+            regex_pattern % 'File',
+            webpage, 'filename')
+        media_type = self._search_regex(
+            regex_pattern % 'Type',
+            webpage, 'media type')
+
+        formats = self._extract_formats(filename, video_id, media_type)
+
+        title = self._search_regex(
+            regex_pattern % 'Title',
+            webpage, 'video title', default=None)
+        if not title:
+            # Fall back to the Open Graph title without the site prefix.
+            title = self._og_search_title(webpage)
+            if title:
+                title = remove_start(title, 'Vísir -').strip()
+
+        description = self._og_search_description(webpage, default=None)
+
+        thumbnail = self._search_regex(
+            regex_pattern % '(?:I|i)mage',
+            webpage, 'thumbnail', default=None)
+        if thumbnail:
+            # Site-relative image paths are joined with the page's base URL.
+            if thumbnail.startswith('/'):
+                thumbnail = urljoin(base_url(url), thumbnail)
+        else:
+            thumbnail = self._og_search_thumbnail(webpage, default=None)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }