[hls] add HLS WebVTT downloader; also, enable subtitle downloading with it

2025-03-12 17:30:01 +08:00 · 2016-11-07 15:45:43 +01:00 · 2016-11-07 15:45:43 +01:00 · f32961f201
commit f32961f201
parent c5e61e32da
4 changed files with 168 additions and 4 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -1830,10 +1830,19 @@ class YoutubeDL(object):
                            return
                    else:
                        try:
-                            sub_data = ie._request_webpage(
-                                sub_info['url'], info_dict['id'], note=False).read()
-                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
-                                subfile.write(sub_data)
+                            if sub_info.get('protocol') is not None:
+                                sub_info_dict = {
+                                    'id': info_dict['id'],
+                                    'protocol': sub_info['protocol'],
+                                    'url': sub_info['url']
+                                }
+                                sub_fd = get_suitable_downloader(sub_info_dict)(self, self.params)
+                                sub_fd.download(sub_filename, sub_info_dict)
+                            else:
+                                sub_data = ie._request_webpage(
+                                    sub_info['url'], info_dict['id'], note=False).read()
+                                with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                    subfile.write(sub_data)
                        except (ExtractorError, IOError, OSError, ValueError) as err:
                            self.report_warning('Unable to download subtitle for "%s": %s' %
                                                (sub_lang, error_to_compat_str(err)))
--- a/youtube_dl/downloader/init.py
+++ b/youtube_dl/downloader/init.py
@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from .common import FileDownloader
 from .f4m import F4mFD
 from .hls import HlsFD
+from .hls import WebVttHlsFD
 from .http import HttpFD
 from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
@ -20,6 +21,7 @@ from ..utils import (
 PROTOCOL_MAP = {
    'rtmp': RtmpFD,
    'm3u8_native': HlsFD,
+    'm3u8_webvtt': WebVttHlsFD,
    'm3u8': FFmpegFD,
    'mms': RtspFD,
    'rtsp': RtspFD,
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@ -208,3 +208,153 @@ class HlsFD(FragmentFD):
        self._finish_frag_download(ctx)

        return True
+
+
+class WebVttHlsFD(FragmentFD):
+    """ A downloader for HLS WebVTT subtitles. """
+    FD_NAME = 'hlswebvtt'
+
+    @staticmethod
+    def _parse_ts(ts):
+        m = re.match('(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
+        hrs, min, sec, msc = m.groups()
+        return 90 * (
+            int(hrs or 0) * 3600000 +
+            int(min or 0) *   60000 +
+            int(sec or 0) *    1000 +
+            int(msc or 0)
+        )
+
+    @staticmethod
+    def _format_ts(ts):
+        ts  = int(ts / 90)
+        hrs = ts / 3600000
+        ts %=      3600000
+        min = ts /   60000
+        ts %=        60000
+        sec = ts /    1000
+        ts %=         1000
+        return '%02u:%02u:%02u.%03u' % (hrs, min, sec, ts)
+
+    def real_download(self, filename, info_dict):
+        url = info_dict['url']
+        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
+        data = self.ydl.urlopen(url).read()
+        s = data.decode('utf-8', 'ignore')
+        segment_urls = []
+        for line in s.splitlines():
+            line = line.strip()
+            if line and not line.startswith('#'):
+                segment_url = (
+                    line if re.match(r'^https?://', line)
+                    else compat_urlparse.urljoin(url, line))
+                segment_urls.append(segment_url)
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(segment_urls),
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        cues = []
+        header = []
+        frags_filenames = []
+        for i, frag_url in enumerate(segment_urls):
+            frag_name = 'Frag%d' % i
+            frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
+
+            success = ctx['dl'].download(frag_filename, {'url': frag_url})
+            if not success:
+                return False
+            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
+            lines = down.read().decode('utf-8', 'ignore').splitlines()
+            down.close()
+            frags_filenames.append(frag_sanitized)
+
+            line_iter = iter(lines)
+            line = next(line_iter)
+            if not line.startswith('WEBVTT'):
+                self.report_error('Not a valid WebVTT subtitles segment')
+            if len(line) > 6 and not (line.startswith('WEBVTT ') or line.startswith('WEBVTT\t')):
+                self.report_error('Not a valid WebVTT subtitles segment')
+
+            try:
+                # read header
+                tsadj = 0
+                while True:
+                    line = next(line_iter)
+                    if line == '':
+                        break
+                    elif line.find('-->') != -1:
+                        break
+
+                    if line.startswith('X-TIMESTAMP-MAP='):
+                        m = re.search(r'LOCAL:([0-9:.]+)', line)
+                        locl_ts = self._parse_ts(m.group(1))
+                        m = re.search(r'MPEGTS:([0-9]+)', line)
+                        mpeg_ts = int(m.group(1))
+                        tsadj = mpeg_ts - locl_ts
+                    else:
+                        header.append(line)
+
+                subtitle = None
+                while True:
+                    while line == '':
+                        line = next(line_iter)
+                    cue = {}
+
+                    if line.find('-->') == -1:
+                        cue['id'] = line
+                        line = next(line_iter)
+                        if line == '':
+                            continue
+
+                    m = re.match(r'^([0-9:.]+\s*)-->\s*([0-9:.]+)(\s+.*)?', line)
+                    if m:
+                        ts_start = self._parse_ts(m.group(1))
+                        ts_end   = self._parse_ts(m.group(2))
+                        cue['style'] = m.group(3) or ''
+                    else:
+                        continue
+
+                    ts_start += tsadj
+                    ts_end   += tsadj
+
+                    cue['start_ts'] = self._format_ts(ts_start)
+                    cue['end_ts'] = self._format_ts(ts_end)
+
+                    line = next(line_iter)
+
+                    cue['text'] = ''
+
+                    try:
+                        while line != '':
+                            if line.find('-->') != -1:
+                                break
+                            cue['text'] += line + '\n'
+                            line = next(line_iter)
+                    finally:
+                        cues.append(cue)
+            except StopIteration:
+                pass
+
+        cues.sort(key=lambda cue: cue['start_ts'])
+        with ctx['dest_stream'] as outf:
+            outf.write(b'WEBVTT\n')
+            for item in header:
+                outf.write(('%s\n' % item).encode('utf-8'))
+            for cue in cues:
+                outf.write(b'\n')
+                if cue.get('id'):
+                    outf.write(('%s\n' % cue['id']).encode('utf-8'))
+                outf.write(
+                    ('%s --> %s%s\n' % (cue['start_ts'], cue['end_ts'], cue['style']))
+                        .encode('utf-8')
+                )
+                outf.write(cue['text'].encode('utf-8'))
+
+        self._finish_frag_download(ctx)
+
+        for frag_file in frags_filenames:
+            os.remove(encodeFilename(frag_file))
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1667,6 +1667,9 @@ class InfoExtractor(object):
                    'url': media['URI'],
                    'ext': determine_ext(media['URI'])
                }
+                if sub_info['ext'] == 'm3u8': # XXX
+                    sub_info['ext'] = 'vtt'
+                    sub_info['protocol'] = 'm3u8_webvtt'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return