# NOTE(review): reconstructed from a whitespace-collapsed, tag-stripped patch
# hunk that starts at line 4 of youtube_dl/extractor/welt.py. The three lines
# above the hunk are not visible here (presumably the usual coding line and
# `from __future__ import unicode_literals`) -- confirm against the real file.
import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    remove_end,
)


class WeltIE(InfoExtractor):
    """Extractor for videos hosted on welt.de.

    The video id is the path segment made of lowercase letters followed by
    digits (e.g. ``sendung157940018``). Format URLs are read from the
    player's ``sources: [{file: ...}, ...]`` array embedded in the page.
    """

    IE_NAME = 'welt.de'
    # The named group must be called `id` because `_match_id` is used below.
    _VALID_URL = r'''https?://(?:www\.)?welt\.de/[^\.]+/(?P<id>[a-z]+\d+)(?:/.*)?'''
    _TESTS = [
        {
            # All videos have a predefined lifetime, usually just 30-45 days
            'url': 'https://www.welt.de/mediathek/dokumentation/natur-und-wildlife/sendung157940018/Mega-Croc-vs-Superschlange.html',
            'info_dict': {
                'id': 'sendung157940018',
                'ext': 'mp4',
                'title': 'Mega Croc vs. Superschlange',
            }
        }
    ]

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict.

        Returns a dict with ``id``, ``ext``, ``title`` and ``formats``.
        ``formats`` is ordered from worst to best; every entry carries a
        ``url`` and a negative ``quality`` rank (-1 for the best entry,
        -2 for the next one, and so on).
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # NOTE(review): the `<title>` tags were stripped from the patch text;
        # the pattern is inferred from the search name 'title' -- confirm.
        title = remove_end(
            self._html_search_regex(
                r'<title>([^<]+)</title>', webpage, 'title').strip(),
            ' - Video - WELT')

        # Grab the whole `sources: [{file: ...}, ...]` array. The quote
        # groups are written as `(["\']?)` (optional *inside* the group) so
        # the backreferences also match when the keys are unquoted; a
        # backreference to a group that did not participate never matches.
        sources = self._search_regex(
            r'(["\']?)sources\1\s*:\s*\[\s*\{\s*(["\']?)file\2\s*:[^\]]+\]',
            webpage, 'sources', group=0)

        formats = []
        for file_url in re.findall(r'http[^\'"]+', sources):
            # The number in front of `.mp4` is used purely as a sort key.
            # URLs without it are kept and ranked lowest.
            mobj = re.search(r'_(\d+)\.mp4', file_url)
            formats.append({
                'url': file_url,
                'quality_key': int_or_none(mobj.group(1)) if mobj else None,
            })
        self._remove_duplicate_formats(formats)

        # None cannot be compared with int on Python 3, so map it to -1.
        formats.sort(
            key=lambda f: -1 if f['quality_key'] is None else f['quality_key'])

        # Rank every entry, including index 0, and drop the internal
        # `quality_key` helper field from the result.
        total = len(formats)
        formats = [
            {'url': f['url'], 'quality': position - total}
            for position, f in enumerate(formats)
        ]

        return {
            'id': video_id,
            'ext': 'mp4',
            'title': title,
            'formats': formats,
        }