From ad9f998e29a1fae216df4a3b69789e2f2a19c6ca Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Sat, 12 Mar 2016 18:00:26 +0100 Subject: [PATCH] [WDR] complete overhaul after relaunch of the site The WDR relaunched their site on 2016-02-23 which not only changed the URL-schema completely but also the layout of their pages. Apparently the whole "mediathek" now runs on the wdr-domain, so no separate URL for funkhauseuropa anymore. There seems to be no explicit handling of video-sizes on the page or in the URLs anymore. There seems to be only one size for HTML5, but still several sizes for flash. The extractor adds all to the list of formats. There is no metadata for the HTML5-stream, so that the best flash-stream will always be considered as the "best" format. At least in my tests this seemed to be true anyway. --- youtube_dl/extractor/wdr.py | 251 +++++++++++++++--------------------- 1 file changed, 101 insertions(+), 150 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a851578e0..c73bc225c 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import itertools import re from .common import InfoExtractor @@ -11,204 +10,156 @@ from ..compat import ( ) from ..utils import ( unified_strdate, - qualities, + ExtractorError, ) class WDRIE(InfoExtractor): - _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' - _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)(?P%s)?\.html' % _PLAYER_REGEX + _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' + _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + + _JS_URL_REGEX = r'(http://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)' _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', + 'md5': 'e58c39c3e30077141d258bf588700a7b', 'info_dict': { - 'id': 'mdb-362427', + 'id': 'mdb-1058683', 'ext': 'flv', - 'title': 'Servicezeit', - 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', - 'upload_date': '20140310', - 'is_live': False - }, - 'params': { - 'skip_download': True, + 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', + 'title': 'Geheimnis Aachener Dom', + 'alt_title': 'Doku am Freitag', + 'upload_date': '20160304', + 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', + 'is_live': False, + 'subtitles': {'de': [{ + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml' + }]}, }, 'skip': 'Page Not Found', }, { - 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', + 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', + 'md5': 'f4c1f96d01cf285240f53ea4309663d8', 'info_dict': { - 'id': 'mdb-363194', + 'id': 'mdb-1072000', + 'ext': 'mp3', + 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', + 'title': 'Schriftstellerin Juli Zeh', + 'alt_title': 'WDR 3 Gespräch am Samstag', + 'upload_date': '20160312', + 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', + 'is_live': False, + 'subtitles': {} + }, + 'skip': 'Page Not Found', + }, + { + 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', + 'info_dict': { + 'id': 'mdb-103364', 'ext': 'flv', - 'title': 'Marga Spiegel ist tot', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20140311', - 'is_live': False - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Page Not Found', - }, - { - 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', - 'md5': '83e9e8fefad36f357278759870805898', - 'info_dict': { - 'id': 'mdb-194332', - 'ext': 'mp3', - 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20091129', - 'is_live': False - }, - }, - { - 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', - 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', - 'info_dict': { - 'id': 'mdb-478135', - 'ext': 'mp3', - 'title': 'Flavia Coelho: Amar é Amar', - 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140717', - 'is_live': False - }, - 'skip': 'Page Not Found', - }, - { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', - 'playlist_mincount': 146, - 'info_dict': { - 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', + 'display_id': 'index', + 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'WDR Fernsehen Live', + 'upload_date': None, + 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'is_live': True, + 'subtitles': {} } }, { - 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', + 'playlist_mincount': 10, 'info_dict': { - 'id': 'mdb-103364', - 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', - 'ext': 'flv', - 'upload_date': '20150101', - 'is_live': True - }, - 'params': { - 'skip_download': True, + 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_url = mobj.group('url') - page_id = mobj.group('id') + url_type = mobj.group('type') + page_url = mobj.group('page_url') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage(url, page_id) + js_url = self._search_regex(self._JS_URL_REGEX, webpage, 'js_url', default=None) - if mobj.group('player') is None: + if not js_url: entries = [ - self.url_result(page_url + href, 'WDR') + self.url_result(page_url + href[0], 'WDR') for href in re.findall( - r'\s*]*>\s*\s*]+href="([^"]+)"', - webpage, 'm3u8 url', default=None) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, page_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - direct_urls = re.findall( - r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) - if direct_urls: - for quality, video_url in direct_urls: - formats.append({ - 'url': video_url, - 'preference': preference(quality), - 'http_headers': { - 'User-Agent': 'mobile', - }, - }) - self._sort_formats(formats) - description = self._html_search_meta('Description', webpage, 'description') - return { - 'id': page_id, - 'formats': formats, + 'id': metadata_tracker_data.get("trackerClipId", display_id), + 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'alt_title': metadata_tracker_data.get("trackerClipSubcategory"), + 'formats': formats, 'upload_date': upload_date, - 'is_live': is_live + 'description': self._html_search_meta("Description", webpage), + 'is_live': is_live, + 'subtitles': subtitles, }