[hls] add HLS WebVTT downloader; also, enable subtitle downloading with it

2025-03-12 23:07:29 +08:00 · 2016-11-07 15:45:43 +01:00 · 2016-11-07 15:45:43 +01:00 · f32961f201
commit f32961f201
parent c5e61e32da
4 changed files with 168 additions and 4 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -1830,10 +1830,19 @@ class YoutubeDL(object):
                            return
                    else:
                        try:
-                            sub_data = ie._request_webpage(
+                            if sub_info.get('protocol') is not None:
-                                sub_info['url'], info_dict['id'], note=False).read()
+                                sub_info_dict = {
-                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                    'id': info_dict['id'],
-                                subfile.write(sub_data)
+                                    'protocol': sub_info['protocol'],
                                    'url': sub_info['url']
                                }
                                sub_fd = get_suitable_downloader(sub_info_dict)(self, self.params)
                                sub_fd.download(sub_filename, sub_info_dict)
                            else:
                                sub_data = ie._request_webpage(
                                    sub_info['url'], info_dict['id'], note=False).read()
                                with io.open(encodeFilename(sub_filename), 'wb') as subfile:
                                    subfile.write(sub_data)
                        except (ExtractorError, IOError, OSError, ValueError) as err:
                            self.report_warning('Unable to download subtitle for "%s": %s' %
                                                (sub_lang, error_to_compat_str(err)))
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@ -3,6 +3,7 @@ from __future__ import unicode_literals
 from .common import FileDownloader
 from .f4m import F4mFD
 from .hls import HlsFD
 from .hls import WebVttHlsFD
 from .http import HttpFD
 from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
@ -20,6 +21,7 @@ from ..utils import (
 PROTOCOL_MAP = {
    'rtmp': RtmpFD,
    'm3u8_native': HlsFD,
    'm3u8_webvtt': WebVttHlsFD,
    'm3u8': FFmpegFD,
    'mms': RtspFD,
    'rtsp': RtspFD,
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@ -208,3 +208,153 @@ class HlsFD(FragmentFD):
        self._finish_frag_download(ctx)
        return True
 class WebVttHlsFD(FragmentFD):
    """ A downloader for HLS WebVTT subtitles. """
    FD_NAME = 'hlswebvtt'
    @staticmethod
    def _parse_ts(ts):
        m = re.match('(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
        hrs, min, sec, msc = m.groups()
        return 90 * (
            int(hrs or 0) * 3600000 +
            int(min or 0) *   60000 +
            int(sec or 0) *    1000 +
            int(msc or 0)
        )
    @staticmethod
    def _format_ts(ts):
        ts  = int(ts / 90)
        hrs = ts / 3600000
        ts %=      3600000
        min = ts /   60000
        ts %=        60000
        sec = ts /    1000
        ts %=         1000
        return '%02u:%02u:%02u.%03u' % (hrs, min, sec, ts)
    def real_download(self, filename, info_dict):
        """Download an HLS WebVTT subtitle playlist into a single WebVTT file.

        Fetches the m3u8 playlist at ``info_dict['url']``, downloads every
        segment it lists, shifts the cue timings of each segment by that
        segment's X-TIMESTAMP-MAP offset and writes all cues, ordered by
        start time, to the destination stream.

        Returns True on success and False if a segment fails to download.
        """
        url = info_dict['url']
        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
        data = self.ydl.urlopen(url).read()
        s = data.decode('utf-8', 'ignore')
        segment_urls = []
        for line in s.splitlines():
            line = line.strip()
            if line and not line.startswith('#'):
                segment_url = (
                    line if re.match(r'^https?://', line)
                    else compat_urlparse.urljoin(url, line))
                segment_urls.append(segment_url)

        ctx = {
            'filename': filename,
            'total_frags': len(segment_urls),
        }
        self._prepare_and_start_frag_download(ctx)

        cues = []
        header = []
        for i, frag_url in enumerate(segment_urls):
            frag_name = 'Frag%d' % i
            frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
            success = ctx['dl'].download(frag_filename, {'url': frag_url})
            if not success:
                return False
            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
            try:
                # 'utf-8-sig' drops the optional byte order mark a WebVTT
                # file may start with; otherwise it decodes like 'utf-8'.
                lines = down.read().decode('utf-8-sig', 'ignore').splitlines()
            finally:
                down.close()
            # The segment is fully in memory now; delete its temporary file
            # right away so that a later failure does not leave it behind.
            os.remove(encodeFilename(frag_sanitized))
            self._parse_webvtt_fragment(lines, header, cues)

        cues.sort(key=lambda cue: cue['start_ts'])
        with ctx['dest_stream'] as outf:
            outf.write(b'WEBVTT\n')
            for item in header:
                outf.write(('%s\n' % item).encode('utf-8'))
            for cue in cues:
                outf.write(b'\n')
                if cue.get('id'):
                    outf.write(('%s\n' % cue['id']).encode('utf-8'))
                outf.write(
                    ('%s --> %s%s\n' % (cue['start_ts'], cue['end_ts'], cue['style']))
                    .encode('utf-8')
                )
                outf.write(cue['text'].encode('utf-8'))
        self._finish_frag_download(ctx)
        return True

    def _parse_webvtt_fragment(self, lines, header, cues):
        """Parse the lines of one WebVTT segment.

        Header lines not seen in an earlier segment are appended to
        ``header``; every cue found is appended to ``cues`` as a dict with
        the keys 'start_ts', 'end_ts', 'style', 'text' and, optionally, 'id'.
        Cue timings are shifted by the segment's X-TIMESTAMP-MAP offset.
        """
        line_iter = iter(lines)
        try:
            line = next(line_iter)
        except StopIteration:
            # Empty segment: nothing to parse.
            return
        if not (line == 'WEBVTT' or line.startswith(('WEBVTT ', 'WEBVTT\t'))):
            self.report_error('Not a valid WebVTT subtitles segment')
        try:
            # Header: everything up to the first blank line or cue timing.
            tsadj = 0
            while True:
                line = next(line_iter)
                if line == '' or '-->' in line:
                    break
                if line.startswith('X-TIMESTAMP-MAP='):
                    # Requiring a leading digit guarantees _parse_ts gets
                    # something it can parse.
                    local = re.search(r'LOCAL:([0-9][0-9:.]*)', line)
                    mpegts = re.search(r'MPEGTS:([0-9]+)', line)
                    if local and mpegts:
                        tsadj = int(mpegts.group(1)) - self._parse_ts(local.group(1))
                elif line not in header:
                    # Segments usually repeat the same header lines; keep
                    # only one copy of each in the merged file.
                    header.append(line)

            # Cues: [id line,] timing line, text lines, blank line.
            while True:
                while line == '':
                    line = next(line_iter)
                cue = {}
                if '-->' not in line:
                    cue['id'] = line
                    line = next(line_iter)
                    if line == '':
                        continue
                m = re.match(
                    r'^([0-9][0-9:.]*\s*)-->\s*([0-9][0-9:.]*)(\s+.*)?', line)
                if m is None:
                    # A malformed timing line must be skipped explicitly,
                    # otherwise the loop would examine it forever.  Any other
                    # line is retried as the identifier of the next cue.
                    if '-->' in line:
                        line = next(line_iter)
                    continue
                cue['style'] = m.group(3) or ''
                cue['start_ts'] = self._format_ts(self._parse_ts(m.group(1)) + tsadj)
                cue['end_ts'] = self._format_ts(self._parse_ts(m.group(2)) + tsadj)
                line = next(line_iter)
                text_lines = []
                try:
                    while line != '' and '-->' not in line:
                        text_lines.append(line + '\n')
                        line = next(line_iter)
                finally:
                    # Also runs when the segment ends in the middle of the
                    # cue text, so the last cue is not lost.
                    cue['text'] = ''.join(text_lines)
                    cues.append(cue)
        except StopIteration:
            # End of segment.
            pass
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1667,6 +1667,9 @@ class InfoExtractor(object):
                    'url': media['URI'],
                    'ext': determine_ext(media['URI'])
                }
                if sub_info['ext'] == 'm3u8': # XXX
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_webvtt'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return