From 91d21e0a8464f102e55a6711d69367a06c531184 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Mon, 6 Feb 2017 17:01:34 +0100 Subject: [PATCH 1/3] [telebasel] [simplex] Add new information extractors --- youtube_dl/extractor/extractors.py | 8 + youtube_dl/extractor/simplex.py | 233 +++++++++++++++++++++++++++++ youtube_dl/extractor/telebasel.py | 125 ++++++++++++++++ 3 files changed, 366 insertions(+) create mode 100644 youtube_dl/extractor/simplex.py create mode 100644 youtube_dl/extractor/telebasel.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 12cda36cc..caac397ef 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -849,6 +849,10 @@ from .shared import ( VivoIE, ) from .showroomlive import ShowRoomLiveIE +from .simplex import ( + SimplexIE, + SimplexHostsIE, +) from .sina import SinaIE from .sixplay import SixPlayIE from .skynewsarabia import ( @@ -931,6 +935,10 @@ from .teamfourstar import TeamFourStarIE from .techtalks import TechTalksIE from .ted import TEDIE from .tele13 import Tele13IE +from .telebasel import ( + TelebaselMediathekIE, + TelebaselArticleIE, +) from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/simplex.py b/youtube_dl/extractor/simplex.py new file mode 100644 index 000000000..2f0ad013f --- /dev/null +++ b/youtube_dl/extractor/simplex.py @@ -0,0 +1,233 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + try_get, + urljoin, +) + + +class SimplexIE(InfoExtractor): + IE_DESC = 'Simplex Player' + _VALID_URL = r'''(?x) + simplex: + (?Phttps?://(?:www\.)?.+): + (?P\d+): + (?P\d+): + (?P\d+) + ''' + + _TEST = { + 'url': 'simplex:http://video.telebasel.ch:4062:4063:62349', + 'only_matching': True, + } + + @staticmethod + def 
_extract_width_height(resolution): + try: + w, h = resolution.split('x') + w = int_or_none(w) + h = int_or_none(h) + return w, h + except (AttributeError, ValueError): + return None, None + + def _known_simplex_format(self, simplex_formats, fid): + for sf in simplex_formats: + if type(sf['id']) == str and sf['id'] == fid: + return sf + elif type(sf['id']) == list and fid in sf['id']: + return sf + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + server_url = mobj.group('server_url') + customer_id = mobj.group('customer_id') + author_id = mobj.group('author_id') + project_id = mobj.group('project_id') + video_id = '%s-%s-%s' % (customer_id, author_id, project_id) + + content_url = urljoin( + server_url, + 'content/%s/%s/%s/' % (customer_id, author_id, project_id)) + + player_data = self._download_json( + urljoin(content_url, 'data.sid'), + video_id, + note='Downloading player data JSON', + errnote='Unable to download player data JSON') + video_data = self._download_json( + urljoin(content_url, 'pl01.sid'), + video_id, + note='Downloading video data JSON', + errnote='Unable to download video data JSON', + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + title = str_or_none(player_data['title']) + description = str_or_none(player_data.get('description')) + timestamp = int_or_none(player_data.get('createDate')) + language = str_or_none(player_data.get('language')) + duration = float_or_none(player_data.get('duration'), scale=10) + + file_information = try_get(video_data, lambda x: x['data'], dict) + if not file_information: + raise ExtractorError('Cannot extract file information data.') + + filename = str_or_none(file_information.get('filename')) + thumbname = str_or_none(file_information.get('thumb')) + thumbnail = urljoin(content_url, thumbname + '.jpg') if thumbname else None + + qualities = try_get(player_data, lambda x: x['qualities'], list) + if not qualities: + raise ExtractorError('Cannot find available 
formats.') + + # simplex_formats is the list of known simplex player formats. + # There might be some more format ids, but we are not sure what they do: + # id 400: It was indicated to be for Apple TV. + # id 500: No additional information found. + simplex_formats = [ + {'id': '20', 'filename': filename + '.flv', 'method': 'url'}, + {'id': '40', 'filename': filename + '_40.flv', 'method': 'url'}, + {'id': '200', 'filename': filename + '.mp4', 'method': 'url'}, + {'id': ['300', '350', '355', '360'], 'filename': 'index.m3u8', 'method': 'm3u8'}, + ] + + formats = [] + + m3u8_done = False + format_infos = [] + for quali in qualities: + fid = str_or_none(quali.get('id')) + + vbr = int_or_none(quali.get('b')) + resolution = str_or_none(quali.get('s')) + width, height = SimplexIE._extract_width_height(resolution) + form_info = { + 'resolution': resolution, + 'width': width, + 'height': height, + 'vbr': vbr, + 'abr': int_or_none(quali.get('ab')), + 'asr': int_or_none(quali.get('ar')), + 'fps': int_or_none(quali.get('r')), + 'language': language, + 'format_id': 'hls-%s' % str_or_none(vbr) + } + format_infos.append(form_info) + + simplex_format = self._known_simplex_format(simplex_formats, fid) + if simplex_format: + format_url = urljoin(content_url, simplex_format['filename']) + if simplex_format['method'] == 'url': + form = { + 'url': format_url + } + form.update(form_info) + formats.append(form) + elif simplex_format['method'] == 'm3u8' and not m3u8_done: + forms = self._extract_m3u8_formats( + format_url, + video_id, + ext='mp4', + entry_protocol='m3u8_native') + formats.extend(forms) + m3u8_done = True + + # Try to add additional information to the formats extracted by _extract_m3u8_formats: + for form in formats: + if form['url'].endswith('.m3u8'): + vbr = int_or_none( + self._search_regex(r'(\d+)kb.m3u8', form['url'], 'm3u8 vbr', default=None)) + if vbr: + try: + form_info = next(f for f in format_infos if f['vbr'] == vbr) + form.update(form_info) + except 
StopIteration: + pass + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'formats': formats, + } + + +class SimplexHostsIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?Phttps?://(?:www\.)? + (?: + video\.telebasel\.ch| + media10\.simplex\.tv + ) + ) + /content/ + (?P\d+)/ + (?P\d+)/ + (?P\d+) + ''' + + _TESTS = [{ + 'url': 'http://media10.simplex.tv/content/906/907/76997/', + 'md5': 'e6b8ebefac5aeae4a6790fec18382ca0', + 'info_dict': { + 'id': '906-907-76997', + 'ext': 'flv', + 'title': '03.02.17: Der Trailer zum Rückrunden-Start', + 'description': None, + 'duration': 44.0, + 'timestamp': 1486135964, + 'upload_date': '20170203', + 'url': 'http://media10.simplex.tv/content/906/907/76997/simvid_1_40.flv', + 'thumbnail': 'http://media10.simplex.tv/content/906/907/76997/simvid_1.jpg', + 'language': 'de', + 'width': 1280, + 'height': 720, + 'vbr': 2304, + 'abr': 160, + 'fps': 25, + 'asr': 44100, + 'resolution': '1280x720' + } + }, { + 'url': 'https://video.telebasel.ch/content/4062/4063/77067', + 'info_dict': { + 'id': '4062-4063-77067', + 'ext': 'flv', + 'title': 'News vom 05.02.2017', + 'description': 'md5:23fb960068621263d5d4418996387674', + 'timestamp': 1486314961, + 'upload_date': '20170205', + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + server_url = mobj.group('server_url') + customer_id = mobj.group('customer_id') + author_id = mobj.group('author_id') + project_id = mobj.group('project_id') + + video_id = '%s-%s-%s' % (customer_id, author_id, project_id) + simplex_url = 'simplex:%s:%s:%s:%s' % (server_url, customer_id, author_id, project_id) + + return self.url_result( + simplex_url, + ie=SimplexIE.ie_key(), + video_id=video_id) diff --git a/youtube_dl/extractor/telebasel.py b/youtube_dl/extractor/telebasel.py new file mode 100644 index 
000000000..2498b2480 --- /dev/null +++ b/youtube_dl/extractor/telebasel.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .simplex import SimplexIE +from ..utils import ( + ExtractorError, + str_or_none, + strip_or_none, + remove_end, + try_get, + urljoin, +) + + +class TelebaselBaseIE(InfoExtractor): + _SERVER_URL = 'https://video.telebasel.ch/' + _CUSTOMER_ID = '4062' + _AUTHOR_ID = '4063' + + +class TelebaselMediathekIE(TelebaselBaseIE): + IE_DESC = 'telebasel.ch Mediathek' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + telebasel\.ch/ + (?!telebasel-archiv) + (?!\d+) + (?P[^/]+) + (?: + /.*pid=(?P\d+).* + )? + ''' + + _TESTS = [{ + 'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881', + 'only_matching': True, + }, { + 'url': 'https://telebasel.ch/telebasel-reihe-8', + 'only_matching': True, + }, { + 'url': 'https://telebasel.ch/telebasel-talk/?channel=15881', + 'only_matching': True, + }] + + def _extract_video_id(self, url, show_name): + webpage = self._download_webpage(url, show_name) + channel_id = self._html_search_regex( + r']+class=["\']tb-mediathek-videos["\'][^>]+data-channels=["\'](\d+)["\']', + webpage, 'channel id') + + episodes_url = urljoin( + self._SERVER_URL, + 'multichannel/%s/%s/.ofdd/json' % (self._CUSTOMER_ID, channel_id)) + episodes = self._download_json( + episodes_url, + channel_id, + note='Downloading episodes JSON', + errnote='Unable to download episodes JSON', + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) + + video_id = str_or_none( + try_get(episodes, lambda x: x['projects'][0]['projectId'], int)) + if not video_id: + raise ExtractorError('Could not extract video id from the webpage.') + + return video_id + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + show_name = mobj.group('show_name') + video_id = mobj.group('pid') + + if not video_id: + video_id = 
self._extract_video_id(url, show_name) + + return self.url_result( + 'simplex:%s:%s:%s:%s' % ( + self._SERVER_URL, self._CUSTOMER_ID, + self._AUTHOR_ID, video_id), + ie=SimplexIE.ie_key()) + + +class TelebaselArticleIE(TelebaselBaseIE): + IE_DESC = 'telebasel.ch articles' + _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P\d{4}/\d{2}/\d{2}/[^/]+)/?' + + _TEST = { + 'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100', + 'info_dict': { + 'id': '2017/02/01/report-usr-iii-einfach-erklaert', + 'title': 'Report: USR III einfach erklärt', + 'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e', + }, + 'playlist_count': 3, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + search_url = urljoin( + self._SERVER_URL, + r'content/%s/%s/(?P\d+)' % (self._CUSTOMER_ID, self._AUTHOR_ID)) + embed_regex = r']+src=["\']%s.+["\']' % search_url + entries = [ + self.url_result( + 'simplex:%s:%s:%s:%s' % ( + self._SERVER_URL, self._CUSTOMER_ID, + self._AUTHOR_ID, m.group('pid')), + ie=SimplexIE.ie_key()) + for m in re.finditer(embed_regex, webpage)] + + title = strip_or_none( + remove_end(self._og_search_title(webpage), '- Telebasel')) + description = self._og_search_description(webpage) + + return self.playlist_result( + entries, + playlist_id=display_id, + playlist_title=title, + playlist_description=description) From 36144cfe1b47b03a0241a906fc8d5737abe317c8 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Mon, 6 Feb 2017 18:13:05 +0100 Subject: [PATCH 2/3] [telebasel] [simplex] Handle Telebasel articles in the generic information extractor. 
--- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 18 +++++++++- youtube_dl/extractor/simplex.py | 7 ++++ youtube_dl/extractor/telebasel.py | 54 +++--------------------------- 4 files changed, 29 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index caac397ef..3c15f2678 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -935,10 +935,7 @@ from .teamfourstar import TeamFourStarIE from .techtalks import TechTalksIE from .ted import TEDIE from .tele13 import Tele13IE -from .telebasel import ( - TelebaselMediathekIE, - TelebaselArticleIE, -) +from .telebasel import TelebaselIE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c233f038..7e9345a4e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -83,6 +83,7 @@ from .twentymin import TwentyMinutenIE from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE +from .simplex import SimplexIE class GenericIE(InfoExtractor): @@ -1499,10 +1500,19 @@ class GenericIE(InfoExtractor): 'timestamp': 1435711927, 'upload_date': '20150701', }, + 'add_ie': [VideoPressIE.ie_key()], + }, + { + # Simplex embed + 'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100', + 'info_dict': { + 'id': '?channel=105100', + 'title': 'Report: USR III einfach erklärt - Telebasel', + }, 'params': { 'skip_download': True, }, - 'add_ie': [VideoPressIE.ie_key()], + 'playlist_count': 3, } # { # # TODO: find another test @@ -2474,6 +2484,12 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( videopress_urls, ie=VideoPressIE.ie_key()) + # Look for Simplex embeds + simplex_urls = SimplexIE._extract_urls(webpage) + if simplex_urls: + return 
_playlist_from_matches( + simplex_urls, ie=SimplexIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/simplex.py b/youtube_dl/extractor/simplex.py index 2f0ad013f..7c5c67a48 100644 --- a/youtube_dl/extractor/simplex.py +++ b/youtube_dl/extractor/simplex.py @@ -29,6 +29,13 @@ class SimplexIE(InfoExtractor): 'only_matching': True, } + @staticmethod + def _extract_urls(webpage): + return ['simplex:%s:%s:%s:%s' % ( + m.group('server_url'), m.group('customer_id'), + m.group('author_id'), m.group('project_id')) + for m in re.finditer(r']+src=["\']%s.+["\']' % SimplexHostsIE._VALID_URL, webpage)] + @staticmethod def _extract_width_height(resolution): try: diff --git a/youtube_dl/extractor/telebasel.py b/youtube_dl/extractor/telebasel.py index 2498b2480..f9d12b780 100644 --- a/youtube_dl/extractor/telebasel.py +++ b/youtube_dl/extractor/telebasel.py @@ -8,20 +8,12 @@ from .simplex import SimplexIE from ..utils import ( ExtractorError, str_or_none, - strip_or_none, - remove_end, try_get, urljoin, ) -class TelebaselBaseIE(InfoExtractor): - _SERVER_URL = 'https://video.telebasel.ch/' - _CUSTOMER_ID = '4062' - _AUTHOR_ID = '4063' - - -class TelebaselMediathekIE(TelebaselBaseIE): +class TelebaselIE(InfoExtractor): IE_DESC = 'telebasel.ch Mediathek' _VALID_URL = r'''(?x) https?:// @@ -34,6 +26,9 @@ class TelebaselMediathekIE(TelebaselBaseIE): /.*pid=(?P\d+).* )? 
''' + _SERVER_URL = 'https://video.telebasel.ch/' + _CUSTOMER_ID = '4062' + _AUTHOR_ID = '4063' _TESTS = [{ 'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881', @@ -82,44 +77,3 @@ class TelebaselMediathekIE(TelebaselBaseIE): self._SERVER_URL, self._CUSTOMER_ID, self._AUTHOR_ID, video_id), ie=SimplexIE.ie_key()) - - -class TelebaselArticleIE(TelebaselBaseIE): - IE_DESC = 'telebasel.ch articles' - _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P\d{4}/\d{2}/\d{2}/[^/]+)/?' - - _TEST = { - 'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100', - 'info_dict': { - 'id': '2017/02/01/report-usr-iii-einfach-erklaert', - 'title': 'Report: USR III einfach erklärt', - 'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e', - }, - 'playlist_count': 3, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - search_url = urljoin( - self._SERVER_URL, - r'content/%s/%s/(?P\d+)' % (self._CUSTOMER_ID, self._AUTHOR_ID)) - embed_regex = r']+src=["\']%s.+["\']' % search_url - entries = [ - self.url_result( - 'simplex:%s:%s:%s:%s' % ( - self._SERVER_URL, self._CUSTOMER_ID, - self._AUTHOR_ID, m.group('pid')), - ie=SimplexIE.ie_key()) - for m in re.finditer(embed_regex, webpage)] - - title = strip_or_none( - remove_end(self._og_search_title(webpage), '- Telebasel')) - description = self._og_search_description(webpage) - - return self.playlist_result( - entries, - playlist_id=display_id, - playlist_title=title, - playlist_description=description) From 788fcd4585432663480a23a83905ff6f8d91c866 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 7 Feb 2017 05:00:33 +0100 Subject: [PATCH 3/3] [simplex] Fix format extraction for python 2.6 and 2.7 --- youtube_dl/extractor/simplex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/simplex.py b/youtube_dl/extractor/simplex.py index 7c5c67a48..dd5e0e101 100644 --- 
a/youtube_dl/extractor/simplex.py +++ b/youtube_dl/extractor/simplex.py @@ -48,7 +48,7 @@ class SimplexIE(InfoExtractor): def _known_simplex_format(self, simplex_formats, fid): for sf in simplex_formats: - if type(sf['id']) == str and sf['id'] == fid: + if sf['id'] == fid: return sf elif type(sf['id']) == list and fid in sf['id']: return sf