From c5e61e32dac1a4f0a324b656e0c9ba45bfae9465 Mon Sep 17 00:00:00 2001 From: felix Date: Mon, 7 Nov 2016 15:45:42 +0100 Subject: [PATCH] [common] extract subtitles info from m3u8 media _extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles and extended to properly handle subtitle references; a wrapper with the old name is provided for compatibility. _parse_m3u8_formats is likewise renamed and extended, but without adding the compatibility wrapper; the test suite is adjusted to test the enhanced method instead. --- test/test_InfoExtractor.py | 29 +++++++++++++++++++++-------- youtube_dl/extractor/common.py | 27 ++++++++++++++++++++------- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 71f6608fe..d4f12848c 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -438,7 +438,14 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 1467, 'width': 1024, 'height': 576, - }] + }], + { + 'fra': [{ + 'url': 'http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_webvtt' + }] + }, ), ( # https://github.com/ytdl-org/youtube-dl/issues/11995 @@ -512,7 +519,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 2374, 'width': 1024, 'height': 576, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/issues/12211 @@ -571,7 +579,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 1396.736, 'width': 854, 'height': 480, - }] + }], + {}, ), ( # http://www.twitch.tv/riotgames/v/6528877 @@ -641,7 +650,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 3214.134, 'width': 1280, 'height': 720, - }] + }], + {}, ), ( # http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 @@ -676,7 +686,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie 
DB","aboutlink":"http:\/ 'tbr': 1200, 'width': 1280, 'height': 720, - }] + }], + {} ), ( # https://github.com/ytdl-org/youtube-dl/issues/18923 @@ -733,17 +744,19 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'acodec': 'none', 'width': 1280, 'height': 720, - }] + }], + {} ), ] - for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: + for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES: with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_m3u8_formats( + formats, subs = self.ie._parse_m3u8_formats_and_subtitles( f.read(), m3u8_url, ext='mp4') self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subs, expected_subs, None) def test_parse_mpd_formats(self): _TEST_CASES = [ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484..824773c6b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1334,7 +1334,6 @@ class InfoExtractor(object): def _formats_key(f): # TODO remove the following workaround - from ..utils import determine_ext if not f.get('ext') and 'url' in f: f['ext'] = determine_ext(f['url']) @@ -1583,7 +1582,11 @@ class InfoExtractor(object): 'format_note': 'Quality selection URL', } - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, + def _extract_m3u8_formats(self, *args, **kwargs): + fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) + return fmts + + def _extract_m3u8_formats_and_subtitles(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, fatal=True, live=False, data=None, headers={}, @@ -1595,26 +1598,28 @@ class InfoExtractor(object): fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} m3u8_doc, urlh = res m3u8_url = urlh.geturl() - return self._parse_m3u8_formats( + 
return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, preference=preference, m3u8_id=m3u8_id, live=live) - def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, + def _parse_m3u8_formats_and_subtitles(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, live=False): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return [] + return [], {} if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay - return [] + return [], {} formats = [] + subtitles = {} + format_url = lambda u: ( u if re.match(r'^https?://', u) @@ -1655,6 +1660,14 @@ class InfoExtractor(object): if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) + # collect subtitle renditions (EXT-X-MEDIA with TYPE=SUBTITLES), keyed by language + if media_type == 'SUBTITLES': + lang = media.get('LANGUAGE') or 'und' # LANGUAGE is optional in EXT-X-MEDIA; XXX: normalise? + sub_info = { + 'url': media['URI'], + 'ext': determine_ext(media['URI']) + } + subtitles.setdefault(lang, []).append(sub_info) if media_type not in ('VIDEO', 'AUDIO'): return media_url = media.get('URI') @@ -1780,7 +1793,7 @@ class InfoExtractor(object): formats.append(http_f) last_stream_inf = {} - return formats + return formats, subtitles @staticmethod def _xpath_ns(path, namespace=None):