From bca49659197427ff65bde2f1889d1a9fb3541b0a Mon Sep 17 00:00:00 2001 From: Roland Hieber Date: Sun, 21 May 2017 15:16:13 +0200 Subject: [PATCH 1/4] [wdr] update tests which fail due to HTTP 404 --- youtube_dl/extractor/wdr.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 8bb7362bb..7907a9b5a 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -103,34 +103,32 @@ class WDRIE(WDRBaseIE): _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', - # HDS download, MD5 is unstable + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/lokalzeit/video-lokalzeit-am-samstag-206.html', 'info_dict': { - 'id': 'mdb-1058683', - 'ext': 'flv', - 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', - 'title': 'Geheimnis Aachener Dom', - 'alt_title': 'Doku am Freitag', - 'upload_date': '20160304', - 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', + 'id': 'mdb-1378846', + 'ext': 'mp4', + 'display_id': 'lokalzeit/video-lokalzeit-am-samstag-206', + 'title': 'Lokalzeit am Samstag', + 'alt_title': 'Lokalzeit', + 'upload_date': '20170520', + 'description': 'md5:4a6785498658eabd870ada34dfd6580c', 'is_live': False, 'subtitles': {'de': [{ - 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml', + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/137/1378846/1378846_15999051.xml', 'ext': 'ttml', }]}, }, }, { - 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', - 'md5': 'f4c1f96d01cf285240f53ea4309663d8', + 'url': 'http://www1.wdr.de/mediathek/audio/wdr-aktuell/audio-in-duesseldorf-wollen-fdp-und-cdu-koalitionsverhandlungen-aufnehm-100.html', 'info_dict': { - 'id': 'mdb-1072000', + 'id': 'mdb-1378415', 'ext': 'mp3', - 'display_id': 
'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', - 'title': 'Schriftstellerin Juli Zeh', - 'alt_title': 'WDR 3 Gespräch am Samstag', - 'upload_date': '20160312', - 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', + 'display_id': 'audio-in-duesseldorf-wollen-fdp-und-cdu-koalitionsverhandlungen-aufnehm-100', + 'title': u'In D\u00fcsseldorf wollen FDP und CDU Koalitionsverhandlungen aufnehm', + 'alt_title': 'WDR Aktuell', + 'upload_date': '20170519', + 'description': 'md5:da9c9e242037b030fd3845b5e2e2068e', 'is_live': False, 'subtitles': {} }, From 943616590931062fd7a7884a160fff22b3378564 Mon Sep 17 00:00:00 2001 From: Roland Hieber Date: Sun, 21 May 2017 16:27:19 +0200 Subject: [PATCH 2/4] [wdr] refactor so that JSONP download is reusable We want to parse multiple JSONP URLs on a page later, so we have to change _html_search_regex() into a re.findall(). --- youtube_dl/extractor/wdr.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 7907a9b5a..b2b2c29ad 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -16,21 +16,29 @@ from ..utils import ( class WDRBaseIE(InfoExtractor): - def _extract_wdr_video(self, webpage, display_id): + def _extract_wdr_jsonp_urls(self, webpage, display_id): + """ returns list of jsonp urls """ # for wdr.de the data-extension is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus, in a tag with the class "videoButton" (previously a link # to the page in a multiline "videoLink"-tag) - json_metadata = self._html_search_regex( + json_metadata = re.findall( r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', - webpage, 'media link', default=None, flags=re.MULTILINE) + webpage, flags=re.MULTILINE) if not json_metadata: return - media_link_obj = 
self._parse_json(json_metadata, display_id, - transform_source=js_to_json) - jsonp_url = media_link_obj['mediaObj']['url'] + urls = [] + for json in json_metadata: + media_link_obj = self._parse_json(json, display_id, + transform_source=js_to_json) + urls.append(media_link_obj['mediaObj']['url']) + + return urls + + def _extract_wdr_video_from_jsonp_url(self, jsonp_url, display_id): + """ returns info dict """ metadata = self._download_json( jsonp_url, display_id, transform_source=strip_jsonp) @@ -199,9 +207,9 @@ class WDRIE(WDRBaseIE): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - info_dict = self._extract_wdr_video(webpage, display_id) + jsonp_urls = self._extract_wdr_jsonp_urls(webpage, display_id) - if not info_dict: + if not jsonp_urls: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( From 2437a7ae4eee9fb893d42f2dd67f31555a07807e Mon Sep 17 00:00:00 2001 From: Roland Hieber Date: Sun, 21 May 2017 16:31:27 +0200 Subject: [PATCH 3/4] [wdr] add support for wdrmaus.de content pages as playlists The content pages directly contain multiple video players without linking to a separate page for each video, like the Mediathek pages do. Therefore, we cannot delegate loading the info_dict from a "video URL" (because it does not exist), but instead we have to grab each video separately from their video player and load the info_dict from the respective JSONP. On the other hand, Mediathek pages only contain links to the separate video pages, and no JSONP URL, so we still need to support the old way of loading each video page separately when playing the playlist. 
--- youtube_dl/extractor/wdr.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index b2b2c29ad..d31866d60 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -105,9 +105,9 @@ class WDRBaseIE(InfoExtractor): class WDRIE(WDRBaseIE): - _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' + _WDR_MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?P<display_id_maus>(?:[^/]+/){1,4}[^/?#]+)\.php5' _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html' - _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _WDR_MAUS_REGEX _TESTS = [ { @@ -187,6 +187,13 @@ class WDRIE(WDRBaseIE): 'description': 'Die Seite mit der Maus -', }, }, + { + 'url': 'http://www.wdrmaus.de/extras/mausthemen/eisenbahn/index.php5', + 'playlist_mincount': 8, + 'info_dict': { + 'id': 'extras/mausthemen/eisenbahn/index', + }, + }, { 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', # Live stream, MD5 unstable @@ -210,6 +217,7 @@ class WDRIE(WDRBaseIE): jsonp_urls = self._extract_wdr_jsonp_urls(webpage, display_id) if not jsonp_urls: + # WDR Mediathek playlist pages contain links to the single video pages: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( @@ -222,6 +230,19 @@ class WDRIE(WDRBaseIE): raise ExtractorError('No downloadable streams found', expected=True) + elif len(jsonp_urls) > 1: + # wdrmaus playlist pages directly contain the JSONP URLs: + display_id = mobj.group('display_id_maus') + entries = [ + self._extract_wdr_video_from_jsonp_url(jsonp_url, display_id) + for jsonp_url in jsonp_urls + ] + return { '_type': 'playlist', 'entries': entries, 'id': display_id } + + else: + # page with a single video + info_dict = self._extract_wdr_video_from_jsonp_url(jsonp_urls[0], display_id) 
+ is_live = url_type == 'live' if is_live: From 84aa791435fdce2919bef351b874dff0dc73b0a1 Mon Sep 17 00:00:00 2001 From: Roland Hieber Date: Sun, 21 May 2017 19:15:58 +0200 Subject: [PATCH 4/4] [wdr] fix failing test.test_unicode_literals.TestUnicodeLiterals --- youtube_dl/extractor/wdr.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index d31866d60..d1e1f721c 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -128,15 +128,15 @@ class WDRIE(WDRBaseIE): }, }, { - 'url': 'http://www1.wdr.de/mediathek/audio/wdr-aktuell/audio-in-duesseldorf-wollen-fdp-und-cdu-koalitionsverhandlungen-aufnehm-100.html', + 'url': 'http://www1.wdr.de/mediathek/audio/wdr5/wdr5-erlebte-geschichten/audio-dieter-rams-designer-mr-braun-100.html', 'info_dict': { - 'id': 'mdb-1378415', + 'id': 'mdb-1376845', 'ext': 'mp3', - 'display_id': 'audio-in-duesseldorf-wollen-fdp-und-cdu-koalitionsverhandlungen-aufnehm-100', - 'title': u'In D\u00fcsseldorf wollen FDP und CDU Koalitionsverhandlungen aufnehm', - 'alt_title': 'WDR Aktuell', - 'upload_date': '20170519', - 'description': 'md5:da9c9e242037b030fd3845b5e2e2068e', + 'display_id': 'wdr5-erlebte-geschichten/audio-dieter-rams-designer-mr-braun-100', + 'title': 'Dieter Rams, Designer "Mr. Braun"', + 'alt_title': 'WDR 5 Erlebte Geschichten', + 'upload_date': '20170521', + 'description': 'md5:0fd731f515ae4fb013b4323a4d7ea946', 'is_live': False, 'subtitles': {} },