1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-13 20:17:15 +08:00

[visir] Complete rewrite of the visir information extractor.

- Articles are handled by VisirArticleIE, the media sites by
   VisirMediaIE.
 - The video m3u8 playlist is retrieved from a JSON playlist, but
   unfortunately this does not always work, e.g. http://www.visir.is/kaup-thriggja-risaskipa-styrkja-tengsl-islands-og-graenlands/article/2017170129275
 - A better way to do it is to mimic the JavaScript code from the visir
   embedded video site, e.g. http://www.visir.is/section/media/?template=iplayer&fileid=SRC38BFD200-8465-4A46-9F2A-342E613568E1
This commit is contained in:
Alex Seiler 2017-01-30 12:57:15 +01:00
parent 40daaef417
commit c9ae209e2c
3 changed files with 106 additions and 82 deletions

View File

@ -1119,7 +1119,11 @@ from .viki import (
VikiIE, VikiIE,
VikiChannelIE, VikiChannelIE,
) )
from .visir import VisirMediaIE from .visir import (
VisirBaseIE,
VisirMediaIE,
VisirArticleIE,
)
from .viu import ( from .viu import (
ViuIE, ViuIE,
ViuPlaylistIE, ViuPlaylistIE,

View File

@ -81,7 +81,6 @@ from .videa import VideaIE
from .twentymin import TwentyMinutenIE from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE from .ustream import UstreamIE
from .openload import OpenloadIE from .openload import OpenloadIE
from .visir import VisirMediaIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1474,20 +1473,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
'add_ie': [TwentyMinutenIE.ie_key()], 'add_ie': [TwentyMinutenIE.ie_key()],
},
{
# Visir embed
'url': 'http://www.visir.is/-viljum-hjalpa-theim-ad-hjalpa-sjalfum-ser-/article/2017170129096',
'info_dict': {
'id': 'VTV8CE25BB4-9132-48AD-A2EE-00AF0BAA02A0',
'ext': 'mp4',
'title': 'H\u00f3pur nemenda s\u00f6fnu\u00f0u pening fyrir Ge\u00f0hj\u00e1lp',
'description': None,
},
'params': {
'skip_download': True,
},
'add_ie': [VisirMediaIE.ie_key()],
} }
# { # {
# # TODO: find another test # # TODO: find another test
@ -2453,12 +2438,6 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches( return _playlist_from_matches(
openload_urls, ie=OpenloadIE.ie_key()) openload_urls, ie=OpenloadIE.ie_key())
# Look for Visir embeds
visir_urls = VisirMediaIE._extract_urls(webpage)
if visir_urls:
return _playlist_from_matches(
visir_urls, ie=VisirMediaIE.ie_key())
# Looking for http://schema.org/VideoObject # Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld( json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject') webpage, video_id, default={}, expected_type='VideoObject')

View File

@ -5,100 +5,141 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
base_url, NO_DEFAULT,
js_to_json,
remove_start, remove_start,
urljoin, urljoin,
) )
class VisirMediaIE(InfoExtractor): class VisirBaseIE(InfoExtractor):
_VALID_URL = r'visir:(?P<id>[^:]+):(?P<type>(?:audio|video)):(?P<category>\d+):(?P<subcategory>\d+)'
_BASE_URL = 'http://www.visir.is'
def _extract_player_info(self, video_id, webpage, default=NO_DEFAULT):
    """Read the argument of the page's ``App.Player.Init(...)`` call.

    The first parenthesised argument of the call is searched for in
    *webpage*, converted with ``js_to_json`` and parsed as JSON.

    Returns an iterable with the values stored under ``FileId``,
    ``Categoryid``, ``Subcategoryid``, ``Type`` and ``File`` (in that
    order; keys missing from the parsed object yield ``None``).  When the
    regex search hands back a falsy value, a list of five ``None`` values
    is returned instead.  *default* is forwarded unchanged to
    ``_search_regex``.
    """
    wanted_keys = ('FileId', 'Categoryid', 'Subcategoryid', 'Type', 'File')
    init_argument = self._search_regex(
        r'App\.Player\.Init\s*\(\s*(.+?)\)',
        webpage, 'player info', default=default)
    if init_argument:
        player_info = self._parse_json(
            init_argument, video_id, transform_source=js_to_json)
        # Lazily evaluated; callers unpack it into five names.
        return (player_info.get(key) for key in wanted_keys)
    # Nothing usable was found: hand back one None per expected field.
    return len(wanted_keys) * [None]
def _extract_fields_from_media_list(self, video_id, category, subcategory, media_type):
    """Look up *video_id* in the site's "related media" JSON list.

    The list for the given *category* / *subcategory* is downloaded
    (with ``&type=audio`` appended when *media_type* is ``'audio'``) and
    the first entry whose ``mediaid`` equals *video_id* is selected.

    Returns a 4-tuple ``(link, file, title, image)`` taken from that
    entry; keys missing from the entry yield ``None``.

    Raises ExtractorError when no entry of the list carries *video_id*
    (previously a bare StopIteration escaped from ``next()``).
    """
    # Function-scope import: ExtractorError lives in the same utils module
    # the file already imports from.
    from ..utils import ExtractorError

    url = 'http://www.visir.is/section/MEDIA?template=related_json&kat=%s&subkat=%s' % (category, subcategory)
    if media_type == 'audio':
        url += '&type=audio'
    media_collection = self._download_json(url, video_id)
    field_names = ('link', 'file', 'title', 'image')
    # Supplying a default to next() lets us report a missing entry properly
    # instead of leaking StopIteration to the caller.
    entry = next(
        (e for e in media_collection if e.get('mediaid') == video_id),
        None)
    if entry is None:
        raise ExtractorError(
            'Unable to find media %s in the related media list' % video_id)
    return tuple(entry.get(field) for field in field_names)
def _real_extract(self, url):
    """Resolve an internal ``visir:<id>:<type>:<category>:<subcategory>`` URL.

    The URL is split with ``_VALID_URL``, the media entry is looked up via
    ``_extract_fields_from_media_list`` and its link (joined onto
    ``_BASE_URL``) is delegated to ``VisirMediaIE``.
    """
    match = re.match(self._VALID_URL, url)
    video_id = match.group('id')
    media_type = match.group('type')
    category_id = match.group('category')
    subcategory_id = match.group('subcategory')

    # Only the link is needed here; file, title and image are discarded.
    media_link, _, _, _ = self._extract_fields_from_media_list(
        video_id, category_id, subcategory_id, media_type)

    target_url = urljoin(self._BASE_URL, media_link)
    return self.url_result(target_url, ie=VisirMediaIE.ie_key())
class VisirMediaIE(VisirBaseIE):
_VALID_URL = r'https?://(?:www\.)?visir\.is/section(?:/media)?/.+?fileid=(?P<id>[^/]+)$' _VALID_URL = r'https?://(?:www\.)?visir\.is/section(?:/media)?/.+?fileid=(?P<id>[^/]+)$'
_TESTS = [{ _TESTS = [{
'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP51729', 'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP51729',
'md5': '1486324696d1b9f30fcea985a7922f2c', 'md5': '1486324696d1b9f30fcea985a7922f2c',
'info_dict': { 'info_dict': {
'id': 'CLP51729', 'id': 'CLP51729',
'display_id': 'CLP51729',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Gu\u00f0j\u00f3n: Mj\u00f6g j\u00e1kv\u00e6\u00f0ur \u00e1 framhaldi\u00f0', 'title': u'Guðjón: Mjög jákvæður á framhaldið',
'description': None, 'description': None,
'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/51729_3.jpg' 'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/ExternalData/IsBolti_clips/51729_3.jpg'
}, },
}, { }, {
'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP45905', 'url': 'http://www.visir.is/section/MEDIA98&fileid=CLP49923',
'info_dict': { 'info_dict': {
'id': 'CLP45905', 'id': 'CLP49923',
'display_id': 'CLP45905', 'ext': 'mp3',
'ext': 'mp4', 'title': u'Ósk Gunnars - Sigga Soffía og dansverkið FUBAR',
'title': 'Eva Laufey - Nau\u00f0synlegt a\u00f0 b\u00f6rn f\u00e1i a\u00f0 koma n\u00e1l\u00e6gt matarger\u00f0', 'description': u'Ósk Gunnars alla virka daga á FM957 frá 13-17',
'description': 'md5:24422433a08d270a3690d149edf113b8',
'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/45905_3.jpg',
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}] }]
@staticmethod def _extract_formats(self, video_id, playlist_url, filepath):
def _extract_urls(webpage): formats = self._extract_wowza_formats(
media_base_url = 'http://www.visir.is/section/media/?template=iplayer&fileid=%s' playlist_url, video_id, skip_protocols=['dash'])
video_ids = [media_base_url % m.group('id') for m in re.finditer( formats.append(
r'App\.Player\.Init\(\{[^\}]*Type:\s*\'(?:audio|video)\'[^\}]+FileId:\s*\'(?P<id>.+?)\'[^\}]+Host:\s*\'visirvod\.365cdn\.is\'', {'url': urljoin('http://static.visir.is/', filepath)})
webpage)]
return video_ids
def _extract_formats(self, filename, video_id, media_type):
playlist_url = 'http://visirvod.365cdn.is/hls-vod/_definst_/mp4:%s/playlist.m3u8' % filename
if media_type == 'video':
formats = self._extract_wowza_formats(
playlist_url, video_id, skip_protocols=['dash'])
else:
formats = self._extract_wowza_formats(
playlist_url, video_id, skip_protocols=['dash', 'f4m', 'm3u8'])
self._sort_formats(formats) self._sort_formats(formats)
return formats return formats
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, video_id)
regex_pattern = r'App\.Player\.Init\s*\(\s*\{[^\}]*%s:[^\}]*?\'(.+?)\''
video_id = self._search_regex(
regex_pattern % 'FileId',
webpage, 'video id')
filename = self._search_regex(
regex_pattern % 'File',
webpage, 'filename')
media_type = self._search_regex(
regex_pattern % 'Type',
webpage, 'media type')
formats = self._extract_formats(filename, video_id, media_type)
title = self._search_regex(
regex_pattern % 'Title',
webpage, 'video title', default=None)
if not title:
title = self._og_search_title(webpage)
if title:
title = remove_start(title, 'Vísir -').strip()
description = self._og_search_description(webpage, default=None) description = self._og_search_description(webpage, default=None)
thumbnail = self._search_regex( _, category_id, subcategory_id, media_type, filepath = self._extract_player_info(
regex_pattern % '(?:I|i)mage', video_id, webpage)
webpage, 'video title', default=None)
if thumbnail: _, playlist_url, title, thumbnail = self._extract_fields_from_media_list(
if thumbnail.startswith('/'): video_id, category_id, subcategory_id, media_type)
thumbnail = urljoin(base_url(url), thumbnail)
else: formats = self._extract_formats(
thumbnail = self._og_search_thumbnail(webpage, default=None) video_id, playlist_url, filepath)
return { return {
'id': video_id, 'id': video_id,
'display_id': display_id,
'title': title, 'title': title,
'description': description, 'description': description,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'formats': formats, 'formats': formats,
} }
class VisirArticleIE(VisirBaseIE):
    """Extractor for visir.is article pages.

    An article is returned as a playlist made of the article's main media
    (if an ``App.Player.Init`` call with usable data is found) followed by
    every visir media page embedded through an ``<iframe>``.
    """
    _VALID_URL = r'https?://(?:www\.)?visir\.is/.+/article/(?P<id>\d+)$'
    _TEST = {
        'url': 'http://www.visir.is/landsmenn-minntust-birnu-brjansdottur/article/2017170128825',
        'info_dict': {
            'id': '2017170128825',
            'title': u'Landsmenn minntust Birnu Brjánsdóttur',
            'description': u'Hundruð kerta voru tendruð á Arnarhóli í ljósaskiptunum í dag.'
        },
        'playlist_count': 2,
    }

    def _real_extract(self, url):
        """Build a playlist of all visir media referenced by the article."""
        article_id = self._match_id(url)
        webpage = self._download_webpage(url, article_id)
        title = remove_start(self._og_search_title(webpage), u'Vísir -').strip()
        description = self._og_search_description(webpage, default=None)

        entries = []

        # Try to find the main video of the article.  default=None is passed
        # so that _extract_player_info returns only None values when the page
        # has no player call; the check below then skips the entry.
        video_id, category_id, subcategory_id, media_type, _ = self._extract_player_info(
            article_id, webpage, default=None)
        if video_id and category_id and subcategory_id and media_type in ('video', 'audio'):
            entries.append(self.url_result(
                'visir:%s:%s:%s:%s' % (video_id, media_type, category_id, subcategory_id),
                ie=VisirBaseIE.ie_key()))

        # Try to find embedded visir videos.  Scheme and host are matched the
        # same way as in VisirMediaIE._VALID_URL so that https and non-www
        # embeds are not silently dropped.
        embed_urls = [m.group('url') for m in re.finditer(
            r'<iframe[^>]+src=(["\'])(?P<url>https?://(?:www\.)?visir\.is/section/.+?)\1', webpage)]
        # A distinct loop name avoids shadowing the ``url`` parameter.
        entries.extend(
            self.url_result(embed_url, ie=VisirMediaIE.ie_key())
            for embed_url in embed_urls)

        return self.playlist_result(
            entries,
            playlist_id=article_id,
            playlist_title=title,
            playlist_description=description)