From cbb5155060f597fd1e2fb54defd76bc97e94267b Mon Sep 17 00:00:00 2001 From: Niklas Date: Tue, 2 Oct 2018 18:21:48 +0200 Subject: [PATCH 01/15] Merge TTML subtitle cues with same timecodes while converting to SRT --- youtube_dl/utils.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e84d35d4d..17ea7bc09 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2831,6 +2831,9 @@ def dfxp2srt(dfxp_data): continue default_style.update(style) + last_begin_time = None + last_end_time = None + for para, index in zip(paras, itertools.count(1)): begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) @@ -2841,12 +2844,20 @@ def dfxp2srt(dfxp_data): if not dur: continue end_time = begin_time + dur - out.append('%d\n%s --> %s\n%s\n\n' % ( - index, - srt_subtitles_timecode(begin_time), - srt_subtitles_timecode(end_time), - parse_node(para))) + if begin_time == last_begin_time and end_time == last_end_time: + out.append('%s\n' % (parse_node(para))) + else: + out.append('\n%d\n%s --> %s\n%s\n' % ( + index, + srt_subtitles_timecode(begin_time), + srt_subtitles_timecode(end_time), + parse_node(para))) + + last_begin_time = begin_time + last_end_time = end_time + + out.append('\n') return ''.join(out) From 405947c32e6975c854e28e69836393b275e88539 Mon Sep 17 00:00:00 2001 From: Niklas Date: Tue, 2 Oct 2018 19:10:33 +0200 Subject: [PATCH 02/15] Reflect merged cues in indexes --- youtube_dl/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 17ea7bc09..c9586e9b1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2833,6 +2833,7 @@ def dfxp2srt(dfxp_data): last_begin_time = None last_end_time = None + index_offset = 0 for para, index in zip(paras, itertools.count(1)): begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) @@ -2846,10 +2847,10 @@ def dfxp2srt(dfxp_data): end_time = begin_time + dur if begin_time == last_begin_time and end_time == last_end_time: + index_offset += 1 out.append('%s\n' % (parse_node(para))) else: - out.append('\n%d\n%s --> %s\n%s\n' % ( - index, + index - index_offset, srt_subtitles_timecode(begin_time), srt_subtitles_timecode(end_time), parse_node(para))) From 7ecd95aa9164ad47252c6fa134692f3d9e3e8d49 Mon Sep 17 00:00:00 2001 From: Niklas Date: Tue, 2 Oct 2018 19:11:12 +0200 Subject: [PATCH 03/15] Don't add preceding newline when converting TTML to SRT --- youtube_dl/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c9586e9b1..cec206742 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2850,6 +2850,9 @@ def dfxp2srt(dfxp_data): index_offset += 1 out.append('%s\n' % (parse_node(para))) else: + if out: + out.append('\n') + out.append('%d\n%s --> %s\n%s\n' % ( index - index_offset, srt_subtitles_timecode(begin_time), srt_subtitles_timecode(end_time), From 582e1d10e7ee52bd177fca3298792c329e22340f Mon Sep 17 00:00:00 2001 From: Niklas Date: Tue, 2 Oct 2018 20:33:40 +0200 Subject: [PATCH 04/15] Update TTML->SRT conversion test --- test/test_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9e28e008f..589a1c2bb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1196,18 +1196,15 @@ The first line srt_data = '''1 00:00:02,080 --> 00:00:05,839 default stylecustom style - -2 -00:00:02,080 --> 00:00:05,839 part 1 part 2 -3 +2 00:00:05,839 --> 00:00:09,560 line 3 part 3 -4 +3 00:00:09,560 --> 00:00:12,359 inner style From 15a5856e9d849d7967c05a4e45a0954cda448954 Mon Sep 17 00:00:00 2001 From: Niklas Date: Wed, 3 Oct 2018 15:21:22 +0200 Subject: [PATCH 05/15] [ard] ARDIE: support subtitles --- youtube_dl/extractor/ard.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf8f61eb..344244743 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, + xpath_attr, update_url_query, url_or_none, ) @@ -257,6 +258,12 @@ class ARDIE(InfoExtractor): video_node, './broadcastDate')) thumbnail = xpath_text(video_node, './/teaserImage//variant/url') + subtitles = [] + for variant, ext in (('dataTimedTextNoOffset', 'ttml'), ('dataTimedTextVtt', 'vtt')): + url = xpath_attr(video_node, './%s' % variant, 'url') + if url: + subtitles.append({'ext': ext, 'url': url}) + formats = [] for a in video_node.findall('.//asset'): f = { @@ -279,6 +286,7 @@ class ARDIE(InfoExtractor): return { 'id': mobj.group('id'), 'formats': formats, + 'subtitles': {'de': subtitles}, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), From 4ad89eb36ba553b2fbd55f4678e53dbd91ba45c0 Mon Sep 17 00:00:00 2001 From: Niklas Date: Fri, 5 Oct 2018 00:30:15 +0200 Subject: [PATCH 06/15] maintain alphabetic order --- youtube_dl/extractor/ard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 344244743..ed53093b9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -12,8 +12,8 @@ from ..utils import ( int_or_none, parse_duration, unified_strdate, - xpath_text, xpath_attr, + xpath_text, update_url_query, url_or_none, ) From 0434e3e22f746ed85b1fad48153ac7ace5fc57ac Mon Sep 17 00:00:00 2001 From: Niklas Date: Fri, 5 Oct 2018 00:32:16 +0200 Subject: [PATCH 07/15] [ard] prevent empty list for a key --- youtube_dl/extractor/ard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index ed53093b9..b3e604587 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -286,7 +286,7 @@ class ARDIE(InfoExtractor): return { 'id': mobj.group('id'), 'formats': formats, - 'subtitles': {'de': subtitles}, + 'subtitles': {'de': subtitles} if subtitles else None, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), From 1135a30970185b04983002fa7b2c572f8647d656 Mon Sep 17 00:00:00 2001 From: Niklas Date: Tue, 9 Oct 2018 19:09:45 +0200 Subject: [PATCH 08/15] [ard] fix ARDIE url regex --- youtube_dl/extractor/ard.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf8f61eb..4ec8a6296 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -227,7 +227,7 @@ class ARDMediathekIE(InfoExtractor): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(-folgen-verpasst)?/(?P[^/?#]+)-(?P[0-9]+))\.html' _TESTS = [{ # available till 14.02.2019 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', @@ -244,6 +244,9 @@ class ARDIE(InfoExtractor): }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/serie/babylon-berlin/videos-folgen-verpasst/Babylon_Berlin_1-folge-100.html', + 'only_matching': True, }] def _real_extract(self, url): From 18dda9e7b4c2eb2bcd21cf6a14426167c9b6b228 Mon Sep 17 00:00:00 2001 From: Niklas Date: Wed, 10 Oct 2018 02:18:13 +0200 Subject: [PATCH 09/15] [ard] ARDBetaMediathek: don't abort when geoblocked This doesn't work since the flag is always present as long as the video is geoblocked in any country. --- youtube_dl/extractor/ard.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 4ec8a6296..e437e297f 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -322,9 +322,6 @@ class ARDBetaMediathekIE(InfoExtractor): } formats = [] for widget in data.values(): - if widget.get('_geoblocked'): - raise ExtractorError('This video is not available due to geoblocking', expected=True) - if '_duration' in widget: res['duration'] = widget['_duration'] if 'clipTitle' in widget: From de352ca7499daa2762c13f1957b9b51911f88342 Mon Sep 17 00:00:00 2001 From: Niklas Date: Wed, 10 Oct 2018 03:16:53 +0200 Subject: [PATCH 10/15] [ard] rework format extraction (fixes #17744) --- youtube_dl/extractor/ard.py | 219 ++++++++++++++++++++++++++++++------ 1 file changed, 182 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index e437e297f..72bf8eaf6 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -19,13 +19,105 @@ from ..utils import ( from ..compat import compat_etree_fromstring -class ARDMediathekIE(InfoExtractor): +class ARDBaseIE(InfoExtractor): + # Format information is valid for all media saved on ARD's Akamai servers. + # IMPORTANT: Not valid for media from WDR/BR/etc. servers + + _ALL_FORMATS = { + 'vcodec': 'H.264', + 'acodec': 'LC-AAC', + } + + _FORMATS = { + 320: { + 'width': 320, + 'height': 188, + 'vbr': 128, + 'abr': 61, + }, + 480: { + 'width': 480, + 'height': 270, + 'vbr': 256, + 'abr': 61, + }, + 512: { + 'width': 512, + 'height': 288, + 'vbr': 512, + 'abr': 94, + }, + 640: { + 'width': 640, + 'height': 360, + 'vbr': 1024, + 'abr': 192, + }, + 960: { + 'width': 960, + 'height': 540, + 'vbr': 1800, + 'abr': 192, + }, + 1280: { + 'width': 1280, + 'height': 720, + 'vbr': 3584, + 'abr': 192, + }, + } + + def _check_additional_formats(self, formats, video_id): + urls = {} + for format in formats: + url = format['url'] + if not url.endswith('.mp4'): + continue + base_url = url.rsplit('/', 1)[0] + m = re.search(r'.*/([0-9]+)-[0-9]\..*$', url) + if not m: + continue + width = int_or_none(m.group(1)) + if base_url in urls and width in urls[base_url]: + continue + if base_url not in urls: + urls[base_url] = [width] + elif width not in urls[base_url]: + urls[base_url].append(width) + + for base_url, ignore in urls.items(): + for width in (x for x in self._FORMATS if x not in ignore): + url = '%s/%s-1.mp4' % (base_url, width) + if self._is_valid_url(url, video_id): + format_info = self._FORMATS[width] + format_id = '%s-%s' % (determine_ext(url), format_info['vbr'] + format_info['abr']) + format_info.update({ + 'url': url, + 'format_id': format_id, + }) + format_info.update(self._ALL_FORMATS) + formats.append(format_info) + + +class ARDMediathekIE(ARDBaseIE): IE_NAME = 'ARD:mediathek' _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ + # available till 06.10.2023 + 'url': 'http://mediathek.daserste.de/Das-Wort-zum-Sonntag/Christian-Rommert-Vielen-Dank-/Video?bcastId=442936&documentId=56727614', + 'md5': 'da2c6b8643cdd4a46f6446bf2786f5b6', + 'info_dict': { + 'id': '56727614', + 'ext': 'mp4', + 'title': 'Christian Rommert: Vielen Dank!?', + 'description': 'md5:ef94f0f576290c7c85137174229a21ca', + 'duration': 233, + }, + }, { # available till 26.07.2022 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', + 'md5': '0ab612119ade6214395380723abbfd11', 'info_dict': { 'id': '44726822', 'ext': 'mp4', @@ -33,10 +125,6 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', 'duration': 1740, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872', 'only_matching': True, @@ -58,6 +146,7 @@ class ARDMediathekIE(InfoExtractor): media_info_url, video_id, 'Downloading media JSON') formats = self._extract_formats(media_info, video_id) + self._check_additional_formats(formats, video_id) if not formats: if '"fsk"' in webpage: @@ -93,7 +182,7 @@ class ARDMediathekIE(InfoExtractor): type_ = media_info.get('_type') media_array = media_info.get('_mediaArray', []) formats = [] - for num, media in enumerate(media_array): + for media in media_array: for stream in media.get('_mediaStreamArray', []): stream_urls = stream.get('_stream') if not stream_urls: @@ -108,6 +197,7 @@ class ARDMediathekIE(InfoExtractor): ext = determine_ext(stream_url) if quality != 'auto' and ext in ('f4m', 'm3u8'): continue + if ext == 'f4m': formats.extend(self._extract_f4m_formats( update_url_query(stream_url, { @@ -123,22 +213,38 @@ class ARDMediathekIE(InfoExtractor): f = { 'url': server, 'play_path': stream_url, - 'format_id': 'a%s-rtmp-%s' % (num, quality), + 'format_id': 'rtmp-%s' % quality, } else: f = { 'url': stream_url, - 'format_id': 'a%s-%s-%s' % (num, ext, quality) + 'format_id': '%s-%s' % (ext, quality), } - m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + + m = re.search(r'.*/([0-9]+)-[0-9]\..*$', stream_url) if m: - f.update({ - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) + width = int_or_none(m.group(1)) + f.update(self._FORMATS.get(width, {})) + f.update(self._ALL_FORMATS) + f.update({'format_id': '%s-%s' % (ext, f['vbr'] + f['abr'])}) + else: + width = stream.get('_width') + height = stream.get('_height') + if not width and not height: + m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + if m: + width = int(m.group('width')) + height = int(m.group('height')) + if width and height: + f.update({ + 'width': width, + 'height': height, + }) + if type_ == 'audio': f['vcodec'] = 'none' formats.append(f) + return formats def _real_extract(self, url): @@ -204,6 +310,7 @@ class ARDMediathekIE(InfoExtractor): 'format_id': fid, 'url': furl, }) + self._check_additional_formats(formats, video_id) self._sort_formats(formats) info = { 'formats': formats, @@ -220,18 +327,19 @@ class ARDMediathekIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if info.get('is_live') else title, 'description': description, - 'thumbnail': thumbnail, }) + if thumbnail: + info['thumbnail'] = thumbnail return info -class ARDIE(InfoExtractor): +class ARDIE(ARDBaseIE): _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(-folgen-verpasst)?/(?P[^/?#]+)-(?P[0-9]+))\.html' _TESTS = [{ # available till 14.02.2019 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + 'md5': '19308261237ed95f2293b05eabede2d0', 'info_dict': { 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', 'id': '102', @@ -261,22 +369,47 @@ class ARDIE(InfoExtractor): thumbnail = xpath_text(video_node, './/teaserImage//variant/url') formats = [] + format_ids = {} for a in video_node.findall('.//asset'): - f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), - } if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + url = a.find('./serverPrefix').text + playpath = a.find('./fileName').text else: - f['url'] = a.find('./fileName').text - formats.append(f) + url = a.find('./fileName').text + playpath = None + + if url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats( + url + '?hdcore=3.11.0', + display_id, f4m_id='hds', fatal=False)) + elif url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats( + url, display_id, m3u8_id='hls', fatal=False)) + else: + tbr = int_or_none(a.find('./totalBitrate').text) + format_id = '%s-%s' % (determine_ext(url), tbr) + if 'HbbTV' in a.attrib['type']: + continue + if format_id in format_ids: + format_ids[format_id] += 1 + format_id += '-%s' % format_ids[format_id] + else: + format_ids[format_id] = 1 + f = { + 'url': url, + 'playpath': playpath, + 'format_id': format_id, + 'width': int_or_none(a.find('./frameWidth').text), + 'height': int_or_none(a.find('./frameHeight').text), + 'vbr': int_or_none(a.find('./bitrateVideo').text), + 'abr': int_or_none(a.find('./bitrateAudio').text), + 'vcodec': a.find('./codecVideo').text, + 'acodec': 'LC-AAC', + 'tbr': tbr, + } + formats.append(f) + + self._check_additional_formats(formats, display_id) self._sort_formats(formats) return { @@ -290,18 +423,18 @@ class ARDIE(InfoExtractor): } -class ARDBetaMediathekIE(InfoExtractor): +class ARDBetaMediathekIE(ARDBaseIE): _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P[a-zA-Z0-9]+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', - 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', + 'md5': '7338f01de1ca7af9538cc0bdfc438dd1', 'info_dict': { 'display_id': 'die-robuste-roswita', 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'title': 'Tatort: Die robuste Roswita', 'description': r're:^Der Mord.*trĂ¼ber ist als die Ilm.', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', + 'thumbnail': r're:^https?://img.ardmediathek.de.*$', 'upload_date': '20180826', 'ext': 'mp4', }, @@ -348,12 +481,24 @@ class ARDBetaMediathekIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: - formats.append({ - 'format_id': 'http-' + widget['_quality'], - 'url': format_url, - 'preference': 10, # Plain HTTP, that's nice - }) + m = re.search(r'.*/([0-9]+)-[0-9]\..*$', format_url) + width = int_or_none(m.group(1)) if m else None + if width and width in self._FORMATS: + info = self._FORMATS[width] + info.update({ + 'format_id': '%s-%s' % (determine_ext(format_url), info['vbr'] + info['abr']), + 'url': format_url, + }) + info.update(self._ALL_FORMATS) + formats.append(info) + else: + formats.append({ + 'format_id': '%s-%s' % (determine_ext(format_url), widget['_quality']), + 'url': format_url, + }) + self._remove_duplicate_formats(formats) + self._check_additional_formats(formats, video_id) self._sort_formats(formats) res['formats'] = formats From f6e1ef674b15cb725aad229511e9502f6ff3c31e Mon Sep 17 00:00:00 2001 From: Niklas Date: Thu, 11 Oct 2018 23:58:31 +0200 Subject: [PATCH 11/15] [zdf] rework format extraction to find additional videos (fixes #17747) --- youtube_dl/extractor/zdf.py | 316 ++++++++++++++++++++++++++---------- 1 file changed, 231 insertions(+), 85 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index afa3f6c47..81345a130 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -11,7 +11,6 @@ from ..utils import ( NO_DEFAULT, orderedSet, parse_codecs, - qualities, try_get, unified_timestamp, update_url_query, @@ -20,7 +19,137 @@ from ..utils import ( ) -class ZDFBaseIE(InfoExtractor): +class ZDFIE(InfoExtractor): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + + _TESTS = [{ + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'info_dict': { + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + 'url': 'https://www.zdf.de/dokumentation/terra-x/mit-antischwerkraft-zu-den-sternen-100.html', + 'md5': 'dede0475add7c2d1fa067358a636e80e', + 'info_dict': { + 'id': 'mit-antischwerkraft-zu-den-sternen-100', + 'ext': 'mp4', + 'title': 'Mit Antischwerkraft zu den Sternen?', + 'description': 'md5:44c0214d0bd2f41a5200af6b38e15186', + 'duration': 311, + 'timestamp': 1538294400, + 'upload_date': '20180930', + } + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, + }] + + _MP4_URL_REGEX = r'^(?P((https?:)?//)?(.*))_(?P[0-9]+)k_p(?P

[0-9]{1,})v(?P[0-9]{1,})\.(?P.{2,3})$' + + _H264_MAIN_L31 = 'avc1.4d001f' + _H264_HIGH_L4 = 'avc1.640028' + + # https://github.com/mediathekview/MServer/blob/master/src/main/java/mServer/crawler/sender/MediathekZdf.java + _BITRATES = { + 11: { + 35: [{ + 'tbr': 2328, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + }, + 12: { + 14: [{ + 'tbr': 2256, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + 15: [{ + 'tbr': 3256, + 'width': 1280, + 'height': 720, + 'vcodec': _H264_HIGH_L4, + }], + 35: [{ + 'tbr': 2328, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + 36: [{ + 'tbr': 3328, + 'width': 1280, + 'height': 720, + 'vcodec': _H264_HIGH_L4, + }], + }, + 13: { + 14: [{ + 'tbr': 2296, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + 15: [{ + 'tbr': 3296, + 'width': 1280, + 'height': 720, + 'vcodec': _H264_HIGH_L4, + }], + 35: [{ + 'tbr': 2328, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + 36: [{ + 'tbr': 3328, + 'width': 1280, + 'height': 720, + 'vcodec': _H264_HIGH_L4, + }], + }, + 14: { + 14: [{ + 'tbr': 2296, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + 35: [{ + 'tbr': 3328, + 'width': 1280, + 'height': 720, + 'vcodec': _H264_HIGH_L4, + }, { + 'tbr': 2328, + 'width': 1024, + 'height': 576, + 'vcodec': _H264_MAIN_L31, + }], + 36: [{ + 'tbr': 3328, + 'width': 1280, + 'height': 720, + 'vcodec': _H264_HIGH_L4, + }], + }, + } + def _call_api(self, url, player, referrer, video_id, item): return self._download_json( url, video_id, 'Downloading JSON %s' % item, @@ -37,32 +166,25 @@ class ZDFBaseIE(InfoExtractor): group='json'), video_id) + def _get_max_bitrate(self, url): + m = re.search(self._MP4_URL_REGEX, url) + if m: + return int_or_none(m.group('bitrate')) + return None -class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') - - _TESTS = [{ - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', - 'info_dict': { - 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', - 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', - 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - }, - }, { - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', - 'only_matching': True, - }] + @staticmethod + def _guess_resolution(bitrate): + if bitrate < 400: + return {'width': 320, 'height': 176} + if 400 <= bitrate < 500: + return {'width': 480, 'height': 272} + if 500 <= bitrate < 1000: + return {'width': 640, 'height': 360} + if 1000 <= bitrate < 1500: + return {'width': 852, 'height': 480} + if 1500 <= bitrate < 2000: + return {'width': 1024, 'height': 576} + return {'width': 1280, 'height': 720} @staticmethod def _extract_subtitles(src): @@ -76,6 +198,65 @@ class ZDFIE(ZDFBaseIE): }) return subtitles + @staticmethod + def _set_language(formats, lang): + if not lang: + return + for format in formats: + format['language'] = lang + + @staticmethod + def _find_single_language(formats): + first_lang = None + for format in formats: + lang = format.get('language') + if lang and not first_lang: + first_lang = lang + continue + if lang != first_lang: + return + return first_lang + + def _find_additional_formats(self, formats, video_id, lang=None): + present = {} + for format in formats: + url = format.get('url') + if not url: + continue + m = re.match(self._MP4_URL_REGEX, url) + if not m: + continue + base_url = m.group('base_url') + p = int_or_none(m.group('p')) + v = int_or_none(m.group('v')) + if not p or not v: + continue + if base_url not in present: + present[base_url] = {v: [p]} + elif v not in present[base_url]: + present[base_url][v] = [p] + elif p not in present[base_url][v]: + present[base_url][v].append(p) + + for base_url, vs in present.items(): + for v, ps in vs.items(): + for p, variants in (x for x in self._BITRATES.get(v, {}).items() if x[0] not in ps): + for f in variants: + f = dict(f) + url = '%s_%sk_p%sv%s.mp4' % (base_url, f['tbr'], p, v) + if self._is_valid_url(url, video_id): + f.update({ + 'url': url, + 'format_id': 'mp4-%s' % f['tbr'], + 'ext': 'mp4', + 'language': lang, + 'acodec': 'mp4a.40.2', + }) + if 'nrodlzdf' in url: + f['format_id'] += '-alt' + f['source_preference'] = -2 + formats.append(f) + def _extract_format(self, video_id, formats, format_urls, meta): format_url = url_or_none(meta.get('url')) if not format_url: @@ -86,26 +267,33 @@ class ZDFIE(ZDFBaseIE): mime_type = meta.get('mimeType') ext = determine_ext(format_url) if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + hls_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8_native', fatal=False) + self._set_language(hls_formats, meta.get('language')) + formats.extend(hls_formats) elif mime_type == 'application/f4m+xml' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + hds_formats = self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + video_id, f4m_id='hds', fatal=False) + self._set_language(hds_formats, meta.get('language')) + formats.extend(hds_formats) else: f = parse_codecs(meta.get('mimeCodec')) - format_id = ['http'] - for p in (meta.get('type'), meta.get('quality')): - if p and isinstance(p, compat_str): - format_id.append(p) + bitrate = self._get_max_bitrate(format_url) + format_note = meta.get('quality') f.update({ 'url': format_url, - 'format_id': '-'.join(format_id), - 'format_note': meta.get('quality'), + 'format_id': 'mp4-%s' % bitrate or format_note or '0', + 'ext': ext, + 'tbr': bitrate, 'language': meta.get('language'), - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - 'preference': -10, }) + if not f.get('width') and not f.get('height') and bitrate: + f.update(self._guess_resolution(bitrate)) + if 'nrodlzdf' in format_url: + f['format_id'] += '-alt' + f['source_preference'] = -2 formats.append(f) def _extract_entry(self, url, player, content, video_id): @@ -143,9 +331,12 @@ class ZDFIE(ZDFBaseIE): 'url': track.get('uri'), 'type': f.get('type'), 'mimeType': f.get('mimeType'), + 'mimeCodec': quality.get('mimeCodec'), 'quality': quality.get('quality'), 'language': track.get('language'), }) + single_lang = self._find_single_language(formats) + self._find_additional_formats(formats, video_id, single_lang) self._sort_formats(formats) thumbnails = [] @@ -235,7 +426,7 @@ class ZDFIE(ZDFBaseIE): return self._extract_mobile(video_id) -class ZDFChannelIE(ZDFBaseIE): +class ZDFChannelIE(InfoExtractor): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', @@ -272,48 +463,3 @@ class ZDFChannelIE(ZDFBaseIE): return self.playlist_result( entries, channel_id, self._og_search_title(webpage, fatal=False)) - - r""" - player = self._extract_player(webpage, channel_id) - - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ From 2d0822bbd05e86ae4a0e35d8a6313f248f88eeef Mon Sep 17 00:00:00 2001 From: Niklas Date: Mon, 24 Dec 2018 03:07:51 +0100 Subject: [PATCH 12/15] [adobepass] support FubuTV MSO --- youtube_dl/extractor/adobepass.py | 61 ++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 1cf2dcbf3..a7834c25d 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import json import re import time import xml.etree.ElementTree as etree @@ -60,6 +62,9 @@ MSO_INFO = { 'username_field': 'IDToken1', 'password_field': 'IDToken2', }, + 'Fubo': { + 'name': 'FuboTV', + }, 'thr030': { 'name': '3 Rivers Communications' }, @@ -1422,11 +1427,11 @@ class AdobePassIE(InfoExtractor): 'domain_name': 'adobe.com', 'redirect_url': url, }) + provider_redirect_page, urlh = provider_redirect_page_res if mso_id == 'Comcast_SSO': # Comcast page flow varies by video site and whether you # are on Comcast's network. - provider_redirect_page, urlh = provider_redirect_page_res if 'automatically signing you in' in provider_redirect_page: oauth_redirect_url = self._html_search_regex( r'window\.location\s*=\s*[\'"]([^\'"]+)', @@ -1458,7 +1463,6 @@ class AdobePassIE(InfoExtractor): elif mso_id == 'Verizon': # In general, if you're connecting from a Verizon-assigned IP, # you will not actually pass your credentials. - provider_redirect_page, urlh = provider_redirect_page_res if 'Please wait ...' in provider_redirect_page: saml_redirect_url = self._html_search_regex( r'self\.parent\.location=(["\'])(?P.+?)\1', @@ -1492,21 +1496,44 @@ class AdobePassIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded' }) else: - # Some providers (e.g. DIRECTV NOW) have another meta refresh - # based redirect that should be followed. - provider_redirect_page, urlh = provider_redirect_page_res - provider_refresh_redirect_url = extract_redirect_url( - provider_redirect_page, url=urlh.geturl()) - if provider_refresh_redirect_url: - provider_redirect_page_res = self._download_webpage_handle( - provider_refresh_redirect_url, video_id, - 'Downloading Provider Redirect Page (meta refresh)') - provider_login_page_res = post_form( - provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { - mso_info.get('username_field', 'username'): username, - mso_info.get('password_field', 'password'): password, - }) + if mso_id == 'Fubo': + config_b64 = self._html_search_regex(r"window.atob\('(.*)'\)\)\)\);", + provider_redirect_page, 'client id', fatal=True) + config_json = base64.b64decode(config_b64.encode()).decode('ascii') + config = json.loads(config_json) + + post_data = { + 'username': username, + 'password': password, + 'client_id': config['clientID'], + 'tenant': config['auth0Tenant'], + 'sso': True, + 'connection': 'Username-Password-Authentication', + 'redirect_uri': config['callbackURL'], + } + post_data.update(config['extraParams']) + base_url = config.get('authorizationServer', {}).get(url, 'https://fubo.auth0.com') + + mvpd_confirm_page_res = self._download_webpage_handle( + base_url + '/usernamepassword/login', video_id, 'Logging in', + data=json.dumps(post_data).encode(), + headers={'Content-Type': 'application/json'}) + else: + # Some providers (e.g. DIRECTV NOW) have another meta refresh + # based redirect that should be followed. + provider_refresh_redirect_url = extract_redirect_url( + provider_redirect_page, url=urlh.geturl()) + if provider_refresh_redirect_url: + provider_redirect_page_res = self._download_webpage_handle( + provider_refresh_redirect_url, video_id, + 'Downloading Provider Redirect Page (meta refresh)') + provider_login_page_res = post_form( + provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE) + mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { + mso_info.get('username_field', 'username'): username, + mso_info.get('password_field', 'password'): password, + }) + if mso_id != 'Rogers': post_form(mvpd_confirm_page_res, 'Confirming Login') From ce362cca2467b83eaaa5174bf303d2d88b370806 Mon Sep 17 00:00:00 2001 From: Niklas Date: Mon, 24 Dec 2018 13:45:19 +0100 Subject: [PATCH 13/15] [ard] fix base extractor inheritance --- youtube_dl/extractor/ard.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index e3bf0bd77..dedfabe3a 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -438,8 +438,7 @@ class ARDIE(ARDBaseIE): } - -class ARDBetaMediathekIE(InfoExtractor): +class ARDBetaMediathekIE(ARDBaseIE): _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', From 343536c60f345d6e3c6c7f2307433c76938064af Mon Sep 17 00:00:00 2001 From: Niklas Date: Mon, 24 Dec 2018 13:53:58 +0100 Subject: [PATCH 14/15] [ard] adjust format id for additional formats --- youtube_dl/extractor/ard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index dedfabe3a..7f09cd6ae 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -93,7 +93,7 @@ class ARDBaseIE(InfoExtractor): url = '%s/%s-1.mp4' % (base_url, width) if self._is_valid_url(url, video_id): format_info = self._FORMATS[width] - format_id = '%s-%s' % (determine_ext(url), format_info['vbr'] + format_info['abr']) + format_id = 'http-%s-%s' % (determine_ext(url), format_info['vbr'] + format_info['abr']) format_info.update({ 'url': url, 'format_id': format_id, From c0ae75b45ca8b7bb30d702a6d0e09a5c1272a309 Mon Sep 17 00:00:00 2001 From: Niklas Date: Mon, 24 Dec 2018 13:56:42 +0100 Subject: [PATCH 15/15] [ard] don't stop extracting when possibly geoblocked --- youtube_dl/extractor/ard.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 7f09cd6ae..5b941dcf0 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -478,7 +478,6 @@ class ARDBetaMediathekIE(ARDBaseIE): subtitles = {} geoblocked = False for widget in data.values(): - if widget.get('_geoblocked') is True: geoblocked = True if '_duration' in widget: @@ -512,10 +511,6 @@ class ARDBetaMediathekIE(ARDBaseIE): format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: - # HTTP formats are not available when geoblocked is True, - # other formats are fine though - if geoblocked: - continue m = re.search(r'.*/([0-9]+)-[0-9]\..*$', format_url) width = int_or_none(m.group(1)) if m else None if width and width in self._FORMATS: