From e7a33a4eb30c3eaebffd1af6a34a19c429eef565 Mon Sep 17 00:00:00 2001 From: runningbits Date: Mon, 3 Apr 2017 23:27:15 +0200 Subject: [PATCH 1/2] [spiegeltv] ability to extract correct title for embedded videos (/embed/ in path) These URLs are used on the spiegel.de main site when embedding short content from spiegel.tv. --- youtube_dl/extractor/spiegeltv.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index e1cfb8698..8163619ba 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -24,6 +24,19 @@ class SpiegeltvIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://www.spiegel.tv/filme/putins-trollfabriken/embed/?autoplay=true', + 'info_dict': { + 'id': 'putins-trollfabriken', + 'ext': 'm4v', + 'title': 'Putins Trollfabriken', + 'description': 'Propagandakrieg in den sozialen Medien', + 'thumbnail': r're:http://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, { 'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/', 'only_matching': True, @@ -34,7 +47,10 @@ class SpiegeltvIE(InfoExtractor): url = url.replace('/#/', '/') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.*?)', webpage, 'title') + if '/embed/' not in url: + title = self._html_search_regex(r'(.*?)', webpage, 'title') + else: + title = self._html_search_regex(r'(.*?)(?:\s*\-\s* Embed)?', webpage, 'title') apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com' version_json = self._download_json( From e323cc688ecc8d67f0a6c8af36dd1d2b9468ef18 Mon Sep 17 00:00:00 2001 From: runningbits Date: Mon, 3 Apr 2017 23:27:28 +0200 Subject: [PATCH 2/2] [spiegel] use Spiegeltv extractor for embedded content on spiegel.de --- youtube_dl/extractor/spiegel.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 
deletions(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index ec1b60388..afb752a45 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -14,7 +14,7 @@ from ..utils import ( class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?:video|sptv/spiegeltv)/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '2c2754212136f35fb4b19767d242f66e', @@ -47,6 +47,14 @@ class SpiegelIE(InfoExtractor): 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', 'upload_date': '20140904', } + }, { + 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-russische-propaganda-a-1136559.html', + 'info_dict': { + 'id': 'putins-trollfabriken', + 'ext': 'm4v', + 'description': 'Propagandakrieg in den sozialen Medien', + 'title': 'Putins Trollfabriken', + } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, @@ -55,10 +63,13 @@ class SpiegelIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage, handle = self._download_webpage_handle(url, video_id) + webpage_url = handle.geturl() + if 'spiegel.de/sptv/' in webpage_url: + webpage_url = self._search_regex(r'<iframe[^>]+src="([^"]+spiegel.tv/[^"]+)"[^>]+>', webpage, 'embedded iframe') # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') + if SpiegeltvIE.suitable(webpage_url): + return self.url_result(webpage_url, 'Spiegeltv') video_data = 
extract_attributes(self._search_regex(r'(]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) @@ -103,7 +114,7 @@ class SpiegelIE(InfoExtractor): class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!(?:video|sptv)/)[^?#]*?-(?P<id>[0-9]+)\.html' IE_NAME = 'Spiegel:Article' IE_DESC = 'Articles on spiegel.de' _TESTS = [{