From c9ae209e2c6c4fa1fb953fe31ece81817da37068 Mon Sep 17 00:00:00 2001
From: Alex Seiler
Date: Mon, 30 Jan 2017 12:57:15 +0100
Subject: [PATCH] [visir] Complete rewrite of the visir information extractor.

- Articles are handled by VisirArticleIE, the media sites by VisirMediaIE.
- The video m3u8 playlist is retrieved from a json playlist, but
  unfortunately this does not always work, e.g.
  http://www.visir.is/kaup-thriggja-risaskipa-styrkja-tengsl-islands-og-graenlands/article/2017170129275
- A better way to do it is to mimic the javascript code from the visir
  embedded video site, e.g.
  http://www.visir.is/section/media/?template=iplayer&fileid=SRC38BFD200-8465-4A46-9F2A-342E613568E1
---
Reviewer note: this patch text was re-flowed from a copy in which line
breaks and indentation had been collapsed and angle-bracketed text had been
stripped. Hunk line counts were checked against the hunk headers and the
diffstat. The named groups (?P<id>, ?P<type>, ?P<category>,
?P<subcategory>, ?P<url>) were restored from the group()/_match_id() calls
that read them. Assumptions to confirm against the original commit: the
"<iframe[^>" prefix of the embed regex in VisirArticleIE, the nesting of the
removed "if title:" and thumbnail "else:" lines, the number of spaces before
the "# TODO" comment, and the author's e-mail address, which could not be
recovered.

 youtube_dl/extractor/extractors.py |   6 +-
 youtube_dl/extractor/generic.py    |  21 ----
 youtube_dl/extractor/visir.py      | 161 ++++++++++++++++++-----------
 3 files changed, 106 insertions(+), 82 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 984c26b06..1d79860d0 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1119,7 +1119,11 @@ from .viki import (
     VikiIE,
     VikiChannelIE,
 )
-from .visir import VisirMediaIE
+from .visir import (
+    VisirBaseIE,
+    VisirMediaIE,
+    VisirArticleIE,
+)
 from .viu import (
     ViuIE,
     ViuPlaylistIE,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 15f0b04ae..a23486620 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -81,7 +81,6 @@ from .videa import VideaIE
 from .twentymin import TwentyMinutenIE
 from .ustream import UstreamIE
 from .openload import OpenloadIE
-from .visir import VisirMediaIE
 
 
 class GenericIE(InfoExtractor):
@@ -1474,20 +1473,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
             'add_ie': [TwentyMinutenIE.ie_key()],
-        },
-        {
-            # Visir embed
-            'url': 'http://www.visir.is/-viljum-hjalpa-theim-ad-hjalpa-sjalfum-ser-/article/2017170129096',
-            'info_dict': {
-                'id': 'VTV8CE25BB4-9132-48AD-A2EE-00AF0BAA02A0',
-                'ext': 'mp4',
-                'title': 'H\u00f3pur nemenda s\u00f6fnu\u00f0u pening fyrir Ge\u00f0hj\u00e1lp',
-                'description': None,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'add_ie': [VisirMediaIE.ie_key()],
         }
         # {
         #     # TODO: find another test
@@ -2453,12 +2438,6 @@ class GenericIE(InfoExtractor):
             return _playlist_from_matches(
                 openload_urls, ie=OpenloadIE.ie_key())
 
-        # Look for Visir embeds
-        visir_urls = VisirMediaIE._extract_urls(webpage)
-        if visir_urls:
-            return _playlist_from_matches(
-                visir_urls, ie=VisirMediaIE.ie_key())
-
         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(
             webpage, video_id, default={}, expected_type='VideoObject')
diff --git a/youtube_dl/extractor/visir.py b/youtube_dl/extractor/visir.py
index a83928cb4..7a67034ea 100644
--- a/youtube_dl/extractor/visir.py
+++ b/youtube_dl/extractor/visir.py
@@ -5,100 +5,141 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    base_url,
+    NO_DEFAULT,
+    js_to_json,
     remove_start,
     urljoin,
 )
 
 
-class VisirMediaIE(InfoExtractor):
+class VisirBaseIE(InfoExtractor):
+    _VALID_URL = r'visir:(?P<id>[^:]+):(?P<type>(?:audio|video)):(?P<category>\d+):(?P<subcategory>\d+)'
+    _BASE_URL = 'http://www.visir.is'
+
+    def _extract_player_info(self, video_id, webpage, default=NO_DEFAULT):
+        field_names = ('FileId', 'Categoryid', 'Subcategoryid', 'Type', 'File')
+        player_info_regex = r'App\.Player\.Init\s*\(\s*(.+?)\)'
+        player_info_script = self._search_regex(
+            player_info_regex, webpage, 'player info', default=default)
+        if not player_info_script:
+            return len(field_names) * [None]
+        player_info_dict = self._parse_json(
+            player_info_script, video_id, transform_source=js_to_json)
+        return (player_info_dict.get(name) for name in field_names)
+
+    def _extract_fields_from_media_list(self, video_id, category, subcategory, media_type):
+        url = 'http://www.visir.is/section/MEDIA?template=related_json&kat=%s&subkat=%s' % (category, subcategory)
+        if media_type == 'audio':
+            url += '&type=audio'
+        media_collection = self._download_json(url, video_id)
+        field_names = ('link', 'file', 'title', 'image')
+        return next(
+            (e.get(field) for field in field_names) for e in media_collection if e.get('mediaid') == video_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, media_type, category_id, subcategory_id = mobj.group(
+            'id', 'type', 'category', 'subcategory')
+        media_link, _, _, _ = self._extract_fields_from_media_list(
+            video_id, category_id, subcategory_id, media_type)
+        return self.url_result(
+            urljoin(self._BASE_URL, media_link), ie=VisirMediaIE.ie_key())
+
+
+class VisirMediaIE(VisirBaseIE):
     _VALID_URL = r'https?://(?:www\.)?visir\.is/section(?:/media)?/.+?fileid=(?P<id>[^/]+)$'
     _TESTS = [{
         'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP51729',
         'md5': '1486324696d1b9f30fcea985a7922f2c',
         'info_dict': {
             'id': 'CLP51729',
-            'display_id': 'CLP51729',
             'ext': 'mp4',
-            'title': 'Gu\u00f0j\u00f3n: Mj\u00f6g j\u00e1kv\u00e6\u00f0ur \u00e1 framhaldi\u00f0',
+            'title': u'Guðjón: Mjög jákvæður á framhaldið',
             'description': None,
-            'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/51729_3.jpg'
+            'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/ExternalData/IsBolti_clips/51729_3.jpg'
         },
     }, {
-        'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP45905',
+        'url': 'http://www.visir.is/section/MEDIA98&fileid=CLP49923',
         'info_dict': {
-            'id': 'CLP45905',
-            'display_id': 'CLP45905',
-            'ext': 'mp4',
-            'title': 'Eva Laufey - Nau\u00f0synlegt a\u00f0 b\u00f6rn f\u00e1i a\u00f0 koma n\u00e1l\u00e6gt matarger\u00f0',
-            'description': 'md5:24422433a08d270a3690d149edf113b8',
-            'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/45905_3.jpg',
+            'id': 'CLP49923',
+            'ext': 'mp3',
+            'title': u'Ósk Gunnars - Sigga Soffía og dansverkið FUBAR',
+            'description': u'Ósk Gunnars alla virka daga á FM957 frá 13-17',
         },
         'params': {
             'skip_download': True,
         },
     }]
 
-    @staticmethod
-    def _extract_urls(webpage):
-        media_base_url = 'http://www.visir.is/section/media/?template=iplayer&fileid=%s'
-        video_ids = [media_base_url % m.group('id') for m in re.finditer(
-            r'App\.Player\.Init\(\{[^\}]*Type:\s*\'(?:audio|video)\'[^\}]+FileId:\s*\'(?P<id>.+?)\'[^\}]+Host:\s*\'visirvod\.365cdn\.is\'',
-            webpage)]
-        return video_ids
-
-    def _extract_formats(self, filename, video_id, media_type):
-        playlist_url = 'http://visirvod.365cdn.is/hls-vod/_definst_/mp4:%s/playlist.m3u8' % filename
-        if media_type == 'video':
-            formats = self._extract_wowza_formats(
-                playlist_url, video_id, skip_protocols=['dash'])
-        else:
-            formats = self._extract_wowza_formats(
-                playlist_url, video_id, skip_protocols=['dash', 'f4m', 'm3u8'])
+    def _extract_formats(self, video_id, playlist_url, filepath):
+        formats = self._extract_wowza_formats(
+            playlist_url, video_id, skip_protocols=['dash'])
+        formats.append(
+            {'url': urljoin('http://static.visir.is/', filepath)})
         self._sort_formats(formats)
         return formats
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        regex_pattern = r'App\.Player\.Init\s*\(\s*\{[^\}]*%s:[^\}]*?\'(.+?)\''
-        video_id = self._search_regex(
-            regex_pattern % 'FileId',
-            webpage, 'video id')
-        filename = self._search_regex(
-            regex_pattern % 'File',
-            webpage, 'filename')
-        media_type = self._search_regex(
-            regex_pattern % 'Type',
-            webpage, 'media type')
-
-        formats = self._extract_formats(filename, video_id, media_type)
-
-        title = self._search_regex(
-            regex_pattern % 'Title',
-            webpage, 'video title', default=None)
-        if not title:
-            title = self._og_search_title(webpage)
-            if title:
-                title = remove_start(title, 'Vísir -').strip()
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
 
         description = self._og_search_description(webpage, default=None)
 
-        thumbnail = self._search_regex(
-            regex_pattern % '(?:I|i)mage',
-            webpage, 'video title', default=None)
-        if thumbnail:
-            if thumbnail.startswith('/'):
-                thumbnail = urljoin(base_url(url), thumbnail)
-        else:
-            thumbnail = self._og_search_thumbnail(webpage, default=None)
+        _, category_id, subcategory_id, media_type, filepath = self._extract_player_info(
+            video_id, webpage)
+
+        _, playlist_url, title, thumbnail = self._extract_fields_from_media_list(
+            video_id, category_id, subcategory_id, media_type)
+
+        formats = self._extract_formats(
+            video_id, playlist_url, filepath)
 
         return {
             'id': video_id,
-            'display_id': display_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
             'formats': formats,
         }
+
+class VisirArticleIE(VisirBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?visir\.is/.+/article/(?P<id>\d+)$'
+
+    _TEST = {
+        'url': 'http://www.visir.is/landsmenn-minntust-birnu-brjansdottur/article/2017170128825',
+        'info_dict': {
+            'id': '2017170128825',
+            'title': u'Landsmenn minntust Birnu Brjánsdóttur',
+            'description': u'Hundruð kerta voru tendruð á Arnarhóli í ljósaskiptunum í dag.'
+        },
+        'playlist_count': 2,
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+
+        title = remove_start(self._og_search_title(webpage), u'Vísir -').strip()
+        description = self._og_search_description(webpage, default=None)
+
+        entries = []
+
+        # Try to find the main video of the article:
+        video_id, category_id, subcategory_id, media_type, _= self._extract_player_info(
+            article_id, webpage, default=None)  # TODO: default?
+        if video_id and category_id and subcategory_id and media_type in ('video', 'audio'):
+            entries.append(self.url_result(
+                'visir:%s:%s:%s:%s' % (video_id, media_type, category_id, subcategory_id),
+                ie=VisirBaseIE.ie_key()))
+
+        # Try to find embedded visir videos:
+        video_urls = [m.group('url') for m in re.finditer(
+            r'<iframe[^>]+src=(["\'])(?P<url>http://www\.visir\.is/section/.+?)\1', webpage)]
+        for url in video_urls:
+            entries.append(self.url_result(url, ie=VisirMediaIE.ie_key()))
+
+        return self.playlist_result(
+            entries,
+            playlist_id=article_id,
+            playlist_title=title,
+            playlist_description=description)