From f32961f2017593ee9303c250c5f2532c3a4f8bd2 Mon Sep 17 00:00:00 2001
From: felix
Date: Mon, 7 Nov 2016 15:45:43 +0100
Subject: [PATCH] [hls] add HLS WebVTT downloader; also, enable subtitle
 downloading with it

---
 youtube_dl/YoutubeDL.py           |  17 ++-
 youtube_dl/downloader/__init__.py |   2 +
 youtube_dl/downloader/hls.py      | 176 ++++++++++++++++++++++++++++++
 youtube_dl/extractor/common.py    |   3 +
 4 files changed, 194 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index f5cb46308..09b6e5e3c 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1830,10 +1830,19 @@ class YoutubeDL(object):
                             return
                     else:
                         try:
-                            sub_data = ie._request_webpage(
-                                sub_info['url'], info_dict['id'], note=False).read()
-                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
-                                subfile.write(sub_data)
+                            if sub_info.get('protocol') is not None:
+                                sub_info_dict = {
+                                    'id': info_dict['id'],
+                                    'protocol': sub_info['protocol'],
+                                    'url': sub_info['url']
+                                }
+                                sub_fd = get_suitable_downloader(sub_info_dict)(self, self.params)
+                                sub_fd.download(sub_filename, sub_info_dict)
+                            else:
+                                sub_data = ie._request_webpage(
+                                    sub_info['url'], info_dict['id'], note=False).read()
+                                with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                    subfile.write(sub_data)
                         except (ExtractorError, IOError, OSError, ValueError) as err:
                             self.report_warning('Unable to download subtitle for "%s": %s'
                                                 % (sub_lang, error_to_compat_str(err)))
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
index 2e485df9d..175d9cfa7 100644
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from .common import FileDownloader
 from .f4m import F4mFD
 from .hls import HlsFD
+from .hls import WebVttHlsFD
 from .http import HttpFD
 from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
@@ -20,6 +21,7 @@ from ..utils import (
 PROTOCOL_MAP = {
     'rtmp': RtmpFD,
     'm3u8_native': HlsFD,
+    'm3u8_webvtt': WebVttHlsFD,
     'm3u8': FFmpegFD,
     'mms': RtspFD,
     'rtsp': RtspFD,
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index b59aad73f..138afe605 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -208,3 +208,179 @@ class HlsFD(FragmentFD):
         self._finish_frag_download(ctx)
 
         return True
+
+
+class WebVttHlsFD(FragmentFD):
+    """
+    A downloader for HLS WebVTT subtitles.
+
+    Downloads every segment listed in the media playlist, shifts the cue
+    timings of each segment by the offset from its X-TIMESTAMP-MAP header
+    and writes all cues, ordered by start time, to one WebVTT file.
+    """
+    FD_NAME = 'hlswebvtt'
+
+    @staticmethod
+    def _parse_ts(ts):
+        """
+        Convert a WebVTT timestamp ([[hh:]mm:]ss[.ttt]) to 90 kHz ticks.
+
+        Raises ValueError if ts does not start with a timestamp.
+        """
+        m = re.match(r'(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
+        if m is None:
+            raise ValueError('Invalid WebVTT timestamp: %r' % ts)
+        hrs, mins, sec, msc = m.groups()
+        return 90 * (
+            int(hrs or 0) * 3600000 +
+            int(mins or 0) * 60000 +
+            int(sec or 0) * 1000 +
+            int(msc or 0)
+        )
+
+    @staticmethod
+    def _format_ts(ts):
+        """ Format 90 kHz ticks as a WebVTT timestamp (hh:mm:ss.ttt). """
+        # Floor division keeps the arithmetic integral on Python 2 and 3
+        ts //= 90
+        hrs, ts = divmod(ts, 3600000)
+        mins, ts = divmod(ts, 60000)
+        sec, ts = divmod(ts, 1000)
+        return '%02u:%02u:%02u.%03u' % (hrs, mins, sec, ts)
+
+    def real_download(self, filename, info_dict):
+        """
+        Download the playlist at info_dict['url'] and merge all of its
+        WebVTT segments into filename.
+
+        Returns True on success, False if a segment could not be
+        downloaded or is not a WebVTT file.
+        """
+        url = info_dict['url']
+        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
+        data = self.ydl.urlopen(url).read()
+        s = data.decode('utf-8', 'ignore')
+        segment_urls = []
+        for line in s.splitlines():
+            line = line.strip()
+            if line and not line.startswith('#'):
+                segment_url = (
+                    line if re.match(r'^https?://', line)
+                    else compat_urlparse.urljoin(url, line))
+                segment_urls.append(segment_url)
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(segment_urls),
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        cues = []
+        header = []
+        for i, frag_url in enumerate(segment_urls):
+            frag_name = 'Frag%d' % i
+            frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
+
+            success = ctx['dl'].download(frag_filename, {'url': frag_url})
+            if not success:
+                return False
+            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
+            try:
+                # utf-8-sig drops the optional BOM in front of "WEBVTT"
+                lines = down.read().decode('utf-8-sig', 'ignore').splitlines()
+            finally:
+                down.close()
+            # The segment is in memory now; deleting it right away means
+            # no fragment is left behind when a later step bails out
+            os.remove(encodeFilename(frag_sanitized))
+
+            line_iter = iter(lines)
+            # An empty segment yields '' and is rejected just below
+            line = next(line_iter, '')
+            if not re.match(r'WEBVTT(?:[ \t]|$)', line):
+                self.report_error('Not a valid WebVTT subtitles segment')
+                return False
+
+            try:
+                # read header
+                tsadj = 0
+                while True:
+                    line = next(line_iter)
+                    if line == '' or '-->' in line:
+                        break
+
+                    if line.startswith('X-TIMESTAMP-MAP='):
+                        locl = re.search(r'LOCAL:([0-9][0-9:.]*)', line)
+                        mpeg = re.search(r'MPEGTS:([0-9]+)', line)
+                        # An incomplete map is ignored instead of crashing
+                        if locl and mpeg:
+                            tsadj = int(mpeg.group(1)) - self._parse_ts(locl.group(1))
+                    elif line not in header:
+                        # Segments repeat the header; keep a single copy
+                        header.append(line)
+
+                while True:
+                    while line == '':
+                        line = next(line_iter)
+                    cue = {}
+
+                    if '-->' not in line:
+                        cue['id'] = line
+                        line = next(line_iter)
+                        if line == '':
+                            continue
+
+                    m = re.match(
+                        r'^([0-9][0-9:.]*)\s*-->\s*([0-9][0-9:.]*)(\s+.*)?', line)
+                    if not m:
+                        if '-->' in line:
+                            # Step over a malformed timing line; retrying
+                            # the same line would never terminate
+                            line = next(line_iter)
+                        continue
+
+                    ts_start = self._parse_ts(m.group(1)) + tsadj
+                    ts_end = self._parse_ts(m.group(2)) + tsadj
+
+                    # Numeric start time, kept only as the sort key
+                    cue['start'] = ts_start
+                    cue['start_ts'] = self._format_ts(ts_start)
+                    cue['end_ts'] = self._format_ts(ts_end)
+                    cue['style'] = m.group(3) or ''
+
+                    line = next(line_iter)
+
+                    cue['text'] = ''
+
+                    try:
+                        while line != '':
+                            if '-->' in line:
+                                break
+                            cue['text'] += line + '\n'
+                            line = next(line_iter)
+                    finally:
+                        cues.append(cue)
+            except StopIteration:
+                pass
+
+        # Sort on the numeric start time: the formatted strings stop
+        # ordering correctly once the hours need more than two digits
+        cues.sort(key=lambda cue: cue['start'])
+        with ctx['dest_stream'] as outf:
+            outf.write(b'WEBVTT\n')
+            for item in header:
+                outf.write(('%s\n' % item).encode('utf-8'))
+            for cue in cues:
+                outf.write(b'\n')
+                if cue.get('id'):
+                    outf.write(('%s\n' % cue['id']).encode('utf-8'))
+                outf.write(
+                    ('%s --> %s%s\n' % (cue['start_ts'], cue['end_ts'], cue['style']))
+                    .encode('utf-8')
+                )
+                outf.write(cue['text'].encode('utf-8'))
+
+        self._finish_frag_download(ctx)
+
+        return True
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 824773c6b..febed514b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1667,6 +1667,9 @@ class InfoExtractor(object):
                     'url': media['URI'],
                     'ext': determine_ext(media['URI'])
                 }
+                if sub_info['ext'] == 'm3u8':  # XXX
+                    sub_info['ext'] = 'vtt'
+                    sub_info['protocol'] = 'm3u8_webvtt'
                 subtitles.setdefault(lang, []).append(sub_info)
             if media_type not in ('VIDEO', 'AUDIO'):
                 return