From 36144cfe1b47b03a0241a906fc8d5737abe317c8 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Mon, 6 Feb 2017 18:13:05 +0100 Subject: [PATCH] [telebasel] [simplex] Handle Telebasel articles in the generic information extractor. --- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 18 +++++++++- youtube_dl/extractor/simplex.py | 7 ++++ youtube_dl/extractor/telebasel.py | 54 +++--------------------------- 4 files changed, 29 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index caac397ef..3c15f2678 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -935,10 +935,7 @@ from .teamfourstar import TeamFourStarIE from .techtalks import TechTalksIE from .ted import TEDIE from .tele13 import Tele13IE -from .telebasel import ( - TelebaselMediathekIE, - TelebaselArticleIE, -) +from .telebasel import TelebaselIE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c233f038..7e9345a4e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -83,6 +83,7 @@ from .twentymin import TwentyMinutenIE from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE +from .simplex import SimplexIE class GenericIE(InfoExtractor): @@ -1499,10 +1500,19 @@ class GenericIE(InfoExtractor): 'timestamp': 1435711927, 'upload_date': '20150701', }, + 'add_ie': [VideoPressIE.ie_key()], + }, + { + # Simplex embed + 'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100', + 'info_dict': { + 'id': '?channel=105100', + 'title': 'Report: USR III einfach erklärt - Telebasel', + }, 'params': { 'skip_download': True, }, - 'add_ie': [VideoPressIE.ie_key()], + 'playlist_count': 3, } # { # # TODO: find another test @@ -2474,6 +2484,12 @@ 
class GenericIE(InfoExtractor): return _playlist_from_matches( videopress_urls, ie=VideoPressIE.ie_key()) + # Look for Simplex embeds + simplex_urls = SimplexIE._extract_urls(webpage) + if simplex_urls: + return _playlist_from_matches( + simplex_urls, ie=SimplexIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/simplex.py b/youtube_dl/extractor/simplex.py index 2f0ad013f..7c5c67a48 100644 --- a/youtube_dl/extractor/simplex.py +++ b/youtube_dl/extractor/simplex.py @@ -29,6 +29,13 @@ class SimplexIE(InfoExtractor): 'only_matching': True, } + @staticmethod + def _extract_urls(webpage): + return ['simplex:%s:%s:%s:%s' % ( + m.group('server_url'), m.group('customer_id'), + m.group('author_id'), m.group('project_id')) + for m in re.finditer(r'<iframe[^>]+src=["\']%s.+["\']' % SimplexHostsIE._VALID_URL, webpage)] + @staticmethod def _extract_width_height(resolution): try: diff --git a/youtube_dl/extractor/telebasel.py b/youtube_dl/extractor/telebasel.py index 2498b2480..f9d12b780 100644 --- a/youtube_dl/extractor/telebasel.py +++ b/youtube_dl/extractor/telebasel.py @@ -8,20 +8,12 @@ from .simplex import SimplexIE from ..utils import ( ExtractorError, str_or_none, - strip_or_none, - remove_end, try_get, urljoin, ) -class TelebaselBaseIE(InfoExtractor): - _SERVER_URL = 'https://video.telebasel.ch/' - _CUSTOMER_ID = '4062' - _AUTHOR_ID = '4063' - - -class TelebaselMediathekIE(TelebaselBaseIE): +class TelebaselIE(InfoExtractor): IE_DESC = 'telebasel.ch Mediathek' _VALID_URL = r'''(?x) https?:// @@ -34,6 +26,9 @@ class TelebaselMediathekIE(TelebaselBaseIE): /.*pid=(?P\d+).* )? 
''' + _SERVER_URL = 'https://video.telebasel.ch/' + _CUSTOMER_ID = '4062' + _AUTHOR_ID = '4063' _TESTS = [{ 'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881', @@ -82,44 +77,3 @@ class TelebaselMediathekIE(TelebaselBaseIE): self._SERVER_URL, self._CUSTOMER_ID, self._AUTHOR_ID, video_id), ie=SimplexIE.ie_key()) - - -class TelebaselArticleIE(TelebaselBaseIE): - IE_DESC = 'telebasel.ch articles' - _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P<id>\d{4}/\d{2}/\d{2}/[^/]+)/?' - - _TEST = { - 'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100', - 'info_dict': { - 'id': '2017/02/01/report-usr-iii-einfach-erklaert', - 'title': 'Report: USR III einfach erklärt', - 'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e', - }, - 'playlist_count': 3, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - search_url = urljoin( - self._SERVER_URL, - r'content/%s/%s/(?P<pid>\d+)' % (self._CUSTOMER_ID, self._AUTHOR_ID)) - embed_regex = r'<iframe[^>]+src=["\']%s.+["\']' % search_url - entries = [ - self.url_result( - 'simplex:%s:%s:%s:%s' % ( - self._SERVER_URL, self._CUSTOMER_ID, - self._AUTHOR_ID, m.group('pid')), - ie=SimplexIE.ie_key()) - for m in re.finditer(embed_regex, webpage)] - - title = strip_or_none( - remove_end(self._og_search_title(webpage), '- Telebasel')) - description = self._og_search_description(webpage) - - return self.playlist_result( - entries, - playlist_id=display_id, - playlist_title=title, - playlist_description=description)