1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-13 11:50:00 +08:00

[telebasel] [simplex] Add new information extractors

This commit is contained in:
Alex Seiler 2017-02-06 17:01:34 +01:00
parent d5d904ff7d
commit 91d21e0a84
3 changed files with 366 additions and 0 deletions

View File

@ -849,6 +849,10 @@ from .shared import (
VivoIE,
)
from .showroomlive import ShowRoomLiveIE
from .simplex import (
SimplexIE,
SimplexHostsIE,
)
from .sina import SinaIE
from .sixplay import SixPlayIE
from .skynewsarabia import (
@ -931,6 +935,10 @@ from .teamfourstar import TeamFourStarIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tele13 import Tele13IE
from .telebasel import (
TelebaselMediathekIE,
TelebaselArticleIE,
)
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .telegraaf import TelegraafIE

View File

@ -0,0 +1,233 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
str_or_none,
try_get,
urljoin,
)
class SimplexIE(InfoExtractor):
    """Extractor for videos hosted on a Simplex player installation.

    Handles internal URLs of the form
    ``simplex:<server_url>:<customer_id>:<author_id>:<project_id>``.
    """

    IE_DESC = 'Simplex Player'
    _VALID_URL = r'''(?x)
                    simplex:
                    (?P<server_url>https?://(?:www\.)?.+):
                    (?P<customer_id>\d+):
                    (?P<author_id>\d+):
                    (?P<project_id>\d+)
                    '''
    _TEST = {
        'url': 'simplex:http://video.telebasel.ch:4062:4063:62349',
        'only_matching': True,
    }

    @staticmethod
    def _extract_width_height(resolution):
        """Split a ``'<width>x<height>'`` string into ``(width, height)``.

        Returns ``(None, None)`` when ``resolution`` has no ``split`` method
        (e.g. it is ``None``) or does not consist of exactly two parts.
        """
        try:
            w, h = resolution.split('x')
            w = int_or_none(w)
            h = int_or_none(h)
            return w, h
        except (AttributeError, ValueError):
            return None, None

    def _known_simplex_format(self, simplex_formats, fid):
        """Return the entry of ``simplex_formats`` that covers ``fid``.

        The ``'id'`` of an entry is either a single id or a list of ids.
        Returns ``None`` when no entry matches.
        """
        for sf in simplex_formats:
            ids = sf['id']
            # Do not compare against ``str``: with ``unicode_literals`` the
            # ids are ``unicode`` on Python 2 and a ``type(...) == str`` check
            # would never match there.  Wrapping a single id in a list keeps
            # the comparison an equality test rather than a substring test.
            if not isinstance(ids, (list, tuple)):
                ids = [ids]
            if fid in ids:
                return sf
        return None

    def _real_extract(self, url):
        """Download the player and video metadata and build the info dict.

        Raises ExtractorError when the file information or the list of
        qualities is missing from the downloaded JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        server_url = mobj.group('server_url')
        customer_id = mobj.group('customer_id')
        author_id = mobj.group('author_id')
        project_id = mobj.group('project_id')
        video_id = '%s-%s-%s' % (customer_id, author_id, project_id)

        content_url = urljoin(
            server_url,
            'content/%s/%s/%s/' % (customer_id, author_id, project_id))

        player_data = self._download_json(
            urljoin(content_url, 'data.sid'),
            video_id,
            note='Downloading player data JSON',
            errnote='Unable to download player data JSON')

        # Only the text between the first '{' and the last '}' is parsed,
        # presumably because the response wraps the JSON object.
        video_data = self._download_json(
            urljoin(content_url, 'pl01.sid'),
            video_id,
            note='Downloading video data JSON',
            errnote='Unable to download video data JSON',
            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])

        title = str_or_none(player_data['title'])
        description = str_or_none(player_data.get('description'))
        timestamp = int_or_none(player_data.get('createDate'))
        language = str_or_none(player_data.get('language'))
        duration = float_or_none(player_data.get('duration'), scale=10)

        file_information = try_get(video_data, lambda x: x['data'], dict)
        if not file_information:
            raise ExtractorError('Cannot extract file information data.')

        filename = str_or_none(file_information.get('filename'))
        thumbname = str_or_none(file_information.get('thumb'))
        thumbnail = urljoin(content_url, thumbname + '.jpg') if thumbname else None

        qualities = try_get(player_data, lambda x: x['qualities'], list)
        if not qualities:
            raise ExtractorError('Cannot find available formats.')

        # simplex_formats is the list of known simplex player formats.
        # There might be some more format ids, but we are not sure what they do:
        # id 400: It was indicated to be for Apple TV.
        # id 500: No additional information found.
        simplex_formats = []
        if filename:
            # The direct download formats are derived from the file name, so
            # they can only be offered when the server reported one.
            simplex_formats.extend([
                {'id': '20', 'filename': filename + '.flv', 'method': 'url'},
                {'id': '40', 'filename': filename + '_40.flv', 'method': 'url'},
                {'id': '200', 'filename': filename + '.mp4', 'method': 'url'},
            ])
        simplex_formats.append(
            {'id': ['300', '350', '355', '360'], 'filename': 'index.m3u8', 'method': 'm3u8'})

        formats = []
        m3u8_done = False
        format_infos = []
        for quali in qualities:
            fid = str_or_none(quali.get('id'))
            vbr = int_or_none(quali.get('b'))
            resolution = str_or_none(quali.get('s'))
            width, height = self._extract_width_height(resolution)
            # NOTE(review): this 'hls-' format_id is also applied to the
            # direct FLV/MP4 formats below - confirm that this is intended.
            form_info = {
                'resolution': resolution,
                'width': width,
                'height': height,
                'vbr': vbr,
                'abr': int_or_none(quali.get('ab')),
                'asr': int_or_none(quali.get('ar')),
                'fps': int_or_none(quali.get('r')),
                'language': language,
                'format_id': 'hls-%s' % str_or_none(vbr)
            }
            format_infos.append(form_info)

            simplex_format = self._known_simplex_format(simplex_formats, fid)
            if not simplex_format:
                continue

            format_url = urljoin(content_url, simplex_format['filename'])
            if simplex_format['method'] == 'url':
                form = {
                    'url': format_url
                }
                form.update(form_info)
                formats.append(form)
            elif simplex_format['method'] == 'm3u8' and not m3u8_done:
                # All HLS quality ids share one master playlist, so it is
                # fetched only once.
                forms = self._extract_m3u8_formats(
                    format_url,
                    video_id,
                    ext='mp4',
                    entry_protocol='m3u8_native')
                formats.extend(forms)
                m3u8_done = True

        # Try to add additional information to the formats extracted by
        # _extract_m3u8_formats: the bitrate found in the playlist URL is
        # matched against the bitrates listed in the player data.
        for form in formats:
            if not form['url'].endswith('.m3u8'):
                continue
            vbr = int_or_none(
                self._search_regex(r'(\d+)kb\.m3u8', form['url'], 'm3u8 vbr', default=None))
            if not vbr:
                continue
            form_info = next((f for f in format_infos if f['vbr'] == vbr), None)
            if form_info:
                form.update(form_info)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
        }
class SimplexHostsIE(InfoExtractor):
    """Map content URLs on the listed Simplex hosts to ``simplex:`` URLs."""

    _VALID_URL = r'''(?x)
                    (?P<server_url>https?://(?:www\.)?
                        (?:
                            video\.telebasel\.ch|
                            media10\.simplex\.tv
                        )
                    )
                    /content/
                    (?P<customer_id>\d+)/
                    (?P<author_id>\d+)/
                    (?P<project_id>\d+)
                    '''

    _TESTS = [{
        'url': 'http://media10.simplex.tv/content/906/907/76997/',
        'md5': 'e6b8ebefac5aeae4a6790fec18382ca0',
        'info_dict': {
            'id': '906-907-76997',
            'ext': 'flv',
            'title': '03.02.17: Der Trailer zum Rückrunden-Start',
            'description': None,
            'duration': 44.0,
            'timestamp': 1486135964,
            'upload_date': '20170203',
            'url': 'http://media10.simplex.tv/content/906/907/76997/simvid_1_40.flv',
            'thumbnail': 'http://media10.simplex.tv/content/906/907/76997/simvid_1.jpg',
            'language': 'de',
            'width': 1280,
            'height': 720,
            'vbr': 2304,
            'abr': 160,
            'fps': 25,
            'asr': 44100,
            'resolution': '1280x720'
        }
    }, {
        'url': 'https://video.telebasel.ch/content/4062/4063/77067',
        'info_dict': {
            'id': '4062-4063-77067',
            'ext': 'flv',
            'title': 'News vom 05.02.2017',
            'description': 'md5:23fb960068621263d5d4418996387674',
            'timestamp': 1486314961,
            'upload_date': '20170205',
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Delegate to SimplexIE through a ``simplex:`` URL built from ``url``."""
        # (server_url, customer_id, author_id, project_id)
        parts = re.match(self._VALID_URL, url).group(
            'server_url', 'customer_id', 'author_id', 'project_id')
        return self.url_result(
            'simplex:%s:%s:%s:%s' % parts,
            ie=SimplexIE.ie_key(),
            # The video id is made of the three numeric ids only.
            video_id='%s-%s-%s' % parts[1:])

View File

@ -0,0 +1,125 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .simplex import SimplexIE
from ..utils import (
ExtractorError,
str_or_none,
strip_or_none,
remove_end,
try_get,
urljoin,
)
class TelebaselBaseIE(InfoExtractor):
    """Base class holding the constants shared by the telebasel.ch extractors."""

    # Server, customer id and author id that the subclasses in this file
    # combine with a project id to build ``simplex:`` URLs.
    _SERVER_URL = 'https://video.telebasel.ch/'
    _CUSTOMER_ID = '4062'
    _AUTHOR_ID = '4063'
class TelebaselMediathekIE(TelebaselBaseIE):
    """Extractor for show pages of the telebasel.ch Mediathek."""

    IE_DESC = 'telebasel.ch Mediathek'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?
                        telebasel\.ch/
                        (?!telebasel-archiv)
                        (?!\d+)
                        (?P<show_name>[^/]+)
                        (?:
                            /.*pid=(?P<pid>\d+).*
                        )?
                    '''

    _TESTS = [{
        'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881',
        'only_matching': True,
    }, {
        'url': 'https://telebasel.ch/telebasel-reihe-8',
        'only_matching': True,
    }, {
        'url': 'https://telebasel.ch/telebasel-talk/?channel=15881',
        'only_matching': True,
    }]

    def _extract_video_id(self, url, show_name):
        """Return the id of the first project listed for the show's channel.

        The channel id is read from the show page; the project list is then
        downloaded from the video server.  Raises ExtractorError when no
        integer project id is found.
        """
        def json_object_only(s):
            # Keep the text from the first '{' up to the last '}'.
            return s[s.index('{'):s.rindex('}') + 1]

        page = self._download_webpage(url, show_name)
        channel_id = self._html_search_regex(
            r'<div[^>]+class=["\']tb-mediathek-videos["\'][^>]+data-channels=["\'](\d+)["\']',
            page, 'channel id')

        listing_path = 'multichannel/%s/%s/.ofdd/json' % (self._CUSTOMER_ID, channel_id)
        listing = self._download_json(
            urljoin(self._SERVER_URL, listing_path),
            channel_id,
            note='Downloading episodes JSON',
            errnote='Unable to download episodes JSON',
            transform_source=json_object_only)

        first_project_id = try_get(
            listing, lambda x: x['projects'][0]['projectId'], int)
        video_id = str_or_none(first_project_id)
        if video_id:
            return video_id
        raise ExtractorError('Could not extract video id from the webpage.')

    def _real_extract(self, url):
        """Delegate to SimplexIE, using ``pid`` from the URL when present."""
        show_name, pid = re.match(self._VALID_URL, url).group('show_name', 'pid')
        # Without a pid in the URL the id has to be looked up via the show page.
        video_id = pid or self._extract_video_id(url, show_name)

        simplex_url = 'simplex:%s:%s:%s:%s' % (
            self._SERVER_URL, self._CUSTOMER_ID, self._AUTHOR_ID, video_id)
        return self.url_result(simplex_url, ie=SimplexIE.ie_key())
class TelebaselArticleIE(TelebaselBaseIE):
    """Extractor for telebasel.ch articles embedding Simplex player iframes.

    Every embedded video becomes one entry of the returned playlist.
    """

    IE_DESC = 'telebasel.ch articles'
    # The slug stops at '?' and '#' so that a query string or fragment never
    # ends up in the id when the URL has no trailing slash.
    _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P<id>\d{4}/\d{2}/\d{2}/[^/?#]+)/?'

    _TEST = {
        'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100',
        'info_dict': {
            'id': '2017/02/01/report-usr-iii-einfach-erklaert',
            'title': 'Report: USR III einfach erklärt',
            'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e',
        },
        'playlist_count': 3,
    }

    def _real_extract(self, url):
        """Return a playlist with one SimplexIE entry per embedded iframe."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Literal URL prefix shared by all embeds of this customer/author.
        embed_prefix = urljoin(
            self._SERVER_URL,
            'content/%s/%s/' % (self._CUSTOMER_ID, self._AUTHOR_ID))
        # The prefix is escaped so that its dots only match literally.  The
        # remainder of the attribute is matched with a negated class instead
        # of a greedy '.+': that keeps the match inside one src attribute
        # (several iframes may share a line) and does not force the project
        # id to give up its last digit when the URL ends right after it.
        embed_regex = (
            r'<iframe[^>]+src=["\']%s(?P<pid>\d+)[^"\']*["\']'
            % re.escape(embed_prefix))

        entries = [
            self.url_result(
                'simplex:%s:%s:%s:%s' % (
                    self._SERVER_URL, self._CUSTOMER_ID,
                    self._AUTHOR_ID, m.group('pid')),
                ie=SimplexIE.ie_key())
            for m in re.finditer(embed_regex, webpage)]

        title = strip_or_none(
            remove_end(self._og_search_title(webpage), '- Telebasel'))
        description = self._og_search_description(webpage)

        return self.playlist_result(
            entries,
            playlist_id=display_id,
            playlist_title=title,
            playlist_description=description)