mirror of
https://github.com/l1ving/youtube-dl
synced 2025-03-13 20:47:19 +08:00
[visir] Complete rewrite of the visir information extractor.
- Articles are handled by VisirArticleIE, the media sites by VisirMediaIE. - The video m3u8 playlist is retrieved from a json playlist, but unfortunately this does not always work, e.g. http://www.visir.is/kaup-thriggja-risaskipa-styrkja-tengsl-islands-og-graenlands/article/2017170129275 - A better way to do it is to mimic the javascript code from the visir embedded video site, e.g. http://www.visir.is/section/media/?template=iplayer&fileid=SRC38BFD200-8465-4A46-9F2A-342E613568E1
This commit is contained in:
parent
40daaef417
commit
c9ae209e2c
@ -1119,7 +1119,11 @@ from .viki import (
|
|||||||
VikiIE,
|
VikiIE,
|
||||||
VikiChannelIE,
|
VikiChannelIE,
|
||||||
)
|
)
|
||||||
from .visir import VisirMediaIE
|
from .visir import (
|
||||||
|
VisirBaseIE,
|
||||||
|
VisirMediaIE,
|
||||||
|
VisirArticleIE,
|
||||||
|
)
|
||||||
from .viu import (
|
from .viu import (
|
||||||
ViuIE,
|
ViuIE,
|
||||||
ViuPlaylistIE,
|
ViuPlaylistIE,
|
||||||
|
@ -81,7 +81,6 @@ from .videa import VideaIE
|
|||||||
from .twentymin import TwentyMinutenIE
|
from .twentymin import TwentyMinutenIE
|
||||||
from .ustream import UstreamIE
|
from .ustream import UstreamIE
|
||||||
from .openload import OpenloadIE
|
from .openload import OpenloadIE
|
||||||
from .visir import VisirMediaIE
|
|
||||||
|
|
||||||
|
|
||||||
class GenericIE(InfoExtractor):
|
class GenericIE(InfoExtractor):
|
||||||
@ -1474,20 +1473,6 @@ class GenericIE(InfoExtractor):
|
|||||||
'skip_download': True,
|
'skip_download': True,
|
||||||
},
|
},
|
||||||
'add_ie': [TwentyMinutenIE.ie_key()],
|
'add_ie': [TwentyMinutenIE.ie_key()],
|
||||||
},
|
|
||||||
{
|
|
||||||
# Visir embed
|
|
||||||
'url': 'http://www.visir.is/-viljum-hjalpa-theim-ad-hjalpa-sjalfum-ser-/article/2017170129096',
|
|
||||||
'info_dict': {
|
|
||||||
'id': 'VTV8CE25BB4-9132-48AD-A2EE-00AF0BAA02A0',
|
|
||||||
'ext': 'mp4',
|
|
||||||
'title': 'H\u00f3pur nemenda s\u00f6fnu\u00f0u pening fyrir Ge\u00f0hj\u00e1lp',
|
|
||||||
'description': None,
|
|
||||||
},
|
|
||||||
'params': {
|
|
||||||
'skip_download': True,
|
|
||||||
},
|
|
||||||
'add_ie': [VisirMediaIE.ie_key()],
|
|
||||||
}
|
}
|
||||||
# {
|
# {
|
||||||
# # TODO: find another test
|
# # TODO: find another test
|
||||||
@ -2453,12 +2438,6 @@ class GenericIE(InfoExtractor):
|
|||||||
return _playlist_from_matches(
|
return _playlist_from_matches(
|
||||||
openload_urls, ie=OpenloadIE.ie_key())
|
openload_urls, ie=OpenloadIE.ie_key())
|
||||||
|
|
||||||
# Look for Visir embeds
|
|
||||||
visir_urls = VisirMediaIE._extract_urls(webpage)
|
|
||||||
if visir_urls:
|
|
||||||
return _playlist_from_matches(
|
|
||||||
visir_urls, ie=VisirMediaIE.ie_key())
|
|
||||||
|
|
||||||
# Looking for http://schema.org/VideoObject
|
# Looking for http://schema.org/VideoObject
|
||||||
json_ld = self._search_json_ld(
|
json_ld = self._search_json_ld(
|
||||||
webpage, video_id, default={}, expected_type='VideoObject')
|
webpage, video_id, default={}, expected_type='VideoObject')
|
||||||
|
@ -5,100 +5,141 @@ import re
|
|||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
base_url,
|
NO_DEFAULT,
|
||||||
|
js_to_json,
|
||||||
remove_start,
|
remove_start,
|
||||||
urljoin,
|
urljoin,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class VisirMediaIE(InfoExtractor):
|
class VisirBaseIE(InfoExtractor):
    """Shared helpers for the visir.is extractors.

    Also acts as an extractor for internal ``visir:<id>:<type>:<category>:
    <subcategory>`` URLs: it looks the media id up in the site's related-media
    JSON list and hands the resulting link over to VisirMediaIE.
    """

    _VALID_URL = r'visir:(?P<id>[^:]+):(?P<type>(?:audio|video)):(?P<category>\d+):(?P<subcategory>\d+)'
    _BASE_URL = 'http://www.visir.is'

    def _extract_player_info(self, video_id, webpage, default=NO_DEFAULT):
        """Parse the argument of the ``App.Player.Init(...)`` call in webpage.

        Returns a 5-tuple with the values of the 'FileId', 'Categoryid',
        'Subcategoryid', 'Type' and 'File' keys (None for any missing key).

        ``default`` is forwarded to ``_search_regex``; when the call is not
        found and a falsy default is returned, a tuple of five Nones is
        returned instead.
        """
        field_names = ('FileId', 'Categoryid', 'Subcategoryid', 'Type', 'File')
        # NOTE(review): the capture stops at the first ')' on the same line,
        # so this presumably relies on the Init argument containing no ')'.
        player_info_script = self._search_regex(
            r'App\.Player\.Init\s*\(\s*(.+?)\)',
            webpage, 'player info', default=default)
        if not player_info_script:
            return (None,) * len(field_names)
        player_info = self._parse_json(
            player_info_script, video_id, transform_source=js_to_json)
        # Always hand back a tuple so both return paths have the same type.
        return tuple(player_info.get(name) for name in field_names)

    def _extract_fields_from_media_list(self, video_id, category, subcategory, media_type):
        """Look video_id up in the related-media JSON list of a category.

        Returns a 4-tuple with the 'link', 'file', 'title' and 'image' values
        of the first list entry whose 'mediaid' equals video_id.

        Raises ExtractorError when no entry matches (previously a bare
        StopIteration escaped from ``next()``).
        """
        # Function-scope import: keeps this block self-contained without
        # touching the module-level import list.
        from ..utils import ExtractorError

        url = '%s/section/MEDIA?template=related_json&kat=%s&subkat=%s' % (
            self._BASE_URL, category, subcategory)
        if media_type == 'audio':
            url += '&type=audio'
        media_collection = self._download_json(url, video_id)

        field_names = ('link', 'file', 'title', 'image')
        for entry in media_collection or []:
            if entry.get('mediaid') == video_id:
                return tuple(entry.get(field) for field in field_names)
        raise ExtractorError(
            'Unable to find media %s in the visir media list' % video_id)

    def _real_extract(self, url):
        """Resolve an internal ``visir:`` URL to a VisirMediaIE url result."""
        mobj = re.match(self._VALID_URL, url)
        video_id, media_type, category_id, subcategory_id = mobj.group(
            'id', 'type', 'category', 'subcategory')
        # Only the link is needed here; file, title and image are ignored.
        media_link, _, _, _ = self._extract_fields_from_media_list(
            video_id, category_id, subcategory_id, media_type)
        return self.url_result(
            urljoin(self._BASE_URL, media_link), ie=VisirMediaIE.ie_key())
|
||||||
|
|
||||||
|
|
||||||
|
class VisirMediaIE(VisirBaseIE):
|
||||||
_VALID_URL = r'https?://(?:www\.)?visir\.is/section(?:/media)?/.+?fileid=(?P<id>[^/]+)$'
|
_VALID_URL = r'https?://(?:www\.)?visir\.is/section(?:/media)?/.+?fileid=(?P<id>[^/]+)$'
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP51729',
|
'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP51729',
|
||||||
'md5': '1486324696d1b9f30fcea985a7922f2c',
|
'md5': '1486324696d1b9f30fcea985a7922f2c',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'CLP51729',
|
'id': 'CLP51729',
|
||||||
'display_id': 'CLP51729',
|
|
||||||
'ext': 'mp4',
|
'ext': 'mp4',
|
||||||
'title': 'Gu\u00f0j\u00f3n: Mj\u00f6g j\u00e1kv\u00e6\u00f0ur \u00e1 framhaldi\u00f0',
|
'title': u'Guðjón: Mjög jákvæður á framhaldið',
|
||||||
'description': None,
|
'description': None,
|
||||||
'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/51729_3.jpg'
|
'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/ExternalData/IsBolti_clips/51729_3.jpg'
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
'url': 'http://www.visir.is/section/MEDIA99&fileid=CLP45905',
|
'url': 'http://www.visir.is/section/MEDIA98&fileid=CLP49923',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': 'CLP45905',
|
'id': 'CLP49923',
|
||||||
'display_id': 'CLP45905',
|
'ext': 'mp3',
|
||||||
'ext': 'mp4',
|
'title': u'Ósk Gunnars - Sigga Soffía og dansverkið FUBAR',
|
||||||
'title': 'Eva Laufey - Nau\u00f0synlegt a\u00f0 b\u00f6rn f\u00e1i a\u00f0 koma n\u00e1l\u00e6gt matarger\u00f0',
|
'description': u'Ósk Gunnars alla virka daga á FM957 frá 13-17',
|
||||||
'description': 'md5:24422433a08d270a3690d149edf113b8',
|
|
||||||
'thumbnail': 'http://www.visir.is/apps/pbcsi.dll/urlget?url=/clips/45905_3.jpg',
|
|
||||||
},
|
},
|
||||||
'params': {
|
'params': {
|
||||||
'skip_download': True,
|
'skip_download': True,
|
||||||
},
|
},
|
||||||
}]
|
}]
|
||||||
|
|
||||||
@staticmethod
|
def _extract_formats(self, video_id, playlist_url, filepath):
|
||||||
def _extract_urls(webpage):
|
|
||||||
media_base_url = 'http://www.visir.is/section/media/?template=iplayer&fileid=%s'
|
|
||||||
video_ids = [media_base_url % m.group('id') for m in re.finditer(
|
|
||||||
r'App\.Player\.Init\(\{[^\}]*Type:\s*\'(?:audio|video)\'[^\}]+FileId:\s*\'(?P<id>.+?)\'[^\}]+Host:\s*\'visirvod\.365cdn\.is\'',
|
|
||||||
webpage)]
|
|
||||||
return video_ids
|
|
||||||
|
|
||||||
def _extract_formats(self, filename, video_id, media_type):
|
|
||||||
playlist_url = 'http://visirvod.365cdn.is/hls-vod/_definst_/mp4:%s/playlist.m3u8' % filename
|
|
||||||
if media_type == 'video':
|
|
||||||
formats = self._extract_wowza_formats(
|
formats = self._extract_wowza_formats(
|
||||||
playlist_url, video_id, skip_protocols=['dash'])
|
playlist_url, video_id, skip_protocols=['dash'])
|
||||||
else:
|
formats.append(
|
||||||
formats = self._extract_wowza_formats(
|
{'url': urljoin('http://static.visir.is/', filepath)})
|
||||||
playlist_url, video_id, skip_protocols=['dash', 'f4m', 'm3u8'])
|
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
return formats
|
return formats
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
display_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, display_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
regex_pattern = r'App\.Player\.Init\s*\(\s*\{[^\}]*%s:[^\}]*?\'(.+?)\''
|
|
||||||
video_id = self._search_regex(
|
|
||||||
regex_pattern % 'FileId',
|
|
||||||
webpage, 'video id')
|
|
||||||
filename = self._search_regex(
|
|
||||||
regex_pattern % 'File',
|
|
||||||
webpage, 'filename')
|
|
||||||
media_type = self._search_regex(
|
|
||||||
regex_pattern % 'Type',
|
|
||||||
webpage, 'media type')
|
|
||||||
|
|
||||||
formats = self._extract_formats(filename, video_id, media_type)
|
|
||||||
|
|
||||||
title = self._search_regex(
|
|
||||||
regex_pattern % 'Title',
|
|
||||||
webpage, 'video title', default=None)
|
|
||||||
if not title:
|
|
||||||
title = self._og_search_title(webpage)
|
|
||||||
if title:
|
|
||||||
title = remove_start(title, 'Vísir -').strip()
|
|
||||||
|
|
||||||
description = self._og_search_description(webpage, default=None)
|
description = self._og_search_description(webpage, default=None)
|
||||||
|
|
||||||
thumbnail = self._search_regex(
|
_, category_id, subcategory_id, media_type, filepath = self._extract_player_info(
|
||||||
regex_pattern % '(?:I|i)mage',
|
video_id, webpage)
|
||||||
webpage, 'video title', default=None)
|
|
||||||
if thumbnail:
|
_, playlist_url, title, thumbnail = self._extract_fields_from_media_list(
|
||||||
if thumbnail.startswith('/'):
|
video_id, category_id, subcategory_id, media_type)
|
||||||
thumbnail = urljoin(base_url(url), thumbnail)
|
|
||||||
else:
|
formats = self._extract_formats(
|
||||||
thumbnail = self._og_search_thumbnail(webpage, default=None)
|
video_id, playlist_url, filepath)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'display_id': display_id,
|
|
||||||
'title': title,
|
'title': title,
|
||||||
'description': description,
|
'description': description,
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
'formats': formats,
|
'formats': formats,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class VisirArticleIE(VisirBaseIE):
    """Extractor for visir.is article pages.

    Produces a playlist made of the article's main player media (if any)
    followed by every visir.is ``/section/`` iframe embedded in the page.
    """

    _VALID_URL = r'https?://(?:www\.)?visir\.is/.+/article/(?P<id>\d+)$'

    _TEST = {
        'url': 'http://www.visir.is/landsmenn-minntust-birnu-brjansdottur/article/2017170128825',
        'info_dict': {
            'id': '2017170128825',
            'title': u'Landsmenn minntust Birnu Brjánsdóttur',
            'description': u'Hundruð kerta voru tendruð á Arnarhóli í ljósaskiptunum í dag.'
        },
        'playlist_count': 2,
    }

    def _real_extract(self, url):
        """Build a playlist result from the media found on the article page."""
        article_id = self._match_id(url)
        webpage = self._download_webpage(url, article_id)

        title = remove_start(self._og_search_title(webpage), u'Vísir -').strip()
        description = self._og_search_description(webpage, default=None)

        entries = []

        # Try to find the main video of the article. default=None makes the
        # lookup non-fatal: an article without a player yields only Nones.
        video_id, category_id, subcategory_id, media_type, _ = self._extract_player_info(
            article_id, webpage, default=None)
        if video_id and category_id and subcategory_id and media_type in ('video', 'audio'):
            entries.append(self.url_result(
                'visir:%s:%s:%s:%s' % (video_id, media_type, category_id, subcategory_id),
                ie=VisirBaseIE.ie_key()))

        # Try to find embedded visir videos. Both http and https iframe
        # sources are accepted, in line with the _VALID_URL patterns.
        for mobj in re.finditer(
                r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.visir\.is/section/.+?)\1',
                webpage):
            # A separate name is used so the `url` parameter is not shadowed.
            entries.append(self.url_result(
                mobj.group('url'), ie=VisirMediaIE.ie_key()))

        return self.playlist_result(
            entries,
            playlist_id=article_id,
            playlist_title=title,
            playlist_description=description)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user