1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-12 23:07:29 +08:00

[hls] add HLS WebVTT downloader; also, enable subtitle downloading with it

This commit is contained in:
felix 2016-11-07 15:45:43 +01:00
parent c5e61e32da
commit f32961f201
4 changed files with 168 additions and 4 deletions

View File

@ -1830,10 +1830,19 @@ class YoutubeDL(object):
return return
else: else:
try: try:
sub_data = ie._request_webpage( if sub_info.get('protocol') is not None:
sub_info['url'], info_dict['id'], note=False).read() sub_info_dict = {
with io.open(encodeFilename(sub_filename), 'wb') as subfile: 'id': info_dict['id'],
subfile.write(sub_data) 'protocol': sub_info['protocol'],
'url': sub_info['url']
}
sub_fd = get_suitable_downloader(sub_info_dict)(self, self.params)
sub_fd.download(sub_filename, sub_info_dict)
else:
sub_data = ie._request_webpage(
sub_info['url'], info_dict['id'], note=False).read()
with io.open(encodeFilename(sub_filename), 'wb') as subfile:
subfile.write(sub_data)
except (ExtractorError, IOError, OSError, ValueError) as err: except (ExtractorError, IOError, OSError, ValueError) as err:
self.report_warning('Unable to download subtitle for "%s": %s' % self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, error_to_compat_str(err))) (sub_lang, error_to_compat_str(err)))

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .common import FileDownloader from .common import FileDownloader
from .f4m import F4mFD from .f4m import F4mFD
from .hls import HlsFD from .hls import HlsFD
from .hls import WebVttHlsFD
from .http import HttpFD from .http import HttpFD
from .rtmp import RtmpFD from .rtmp import RtmpFD
from .dash import DashSegmentsFD from .dash import DashSegmentsFD
@ -20,6 +21,7 @@ from ..utils import (
PROTOCOL_MAP = { PROTOCOL_MAP = {
'rtmp': RtmpFD, 'rtmp': RtmpFD,
'm3u8_native': HlsFD, 'm3u8_native': HlsFD,
'm3u8_webvtt': WebVttHlsFD,
'm3u8': FFmpegFD, 'm3u8': FFmpegFD,
'mms': RtspFD, 'mms': RtspFD,
'rtsp': RtspFD, 'rtsp': RtspFD,

View File

@ -208,3 +208,153 @@ class HlsFD(FragmentFD):
self._finish_frag_download(ctx) self._finish_frag_download(ctx)
return True return True
class WebVttHlsFD(FragmentFD):
    """ A downloader for HLS WebVTT subtitles.

    Fetches an m3u8 playlist, downloads every listed WebVTT segment, shifts
    the cue timings of each segment by the offset announced in its
    X-TIMESTAMP-MAP header and writes all cues, ordered by start time, into
    a single WebVTT file.
    """

    FD_NAME = 'hlswebvtt'

    @staticmethod
    def _parse_ts(ts):
        """Convert a WebVTT timestamp ([[hh:]mm:]ss[.fff]) to 90 kHz ticks.

        The fractional part is interpreted as a decimal fraction of a second
        with millisecond precision ('.5' is 500 ms; digits past the third
        are dropped).  Leading text is matched from the start of the string;
        anything after the timestamp (e.g. trailing whitespace) is ignored.

        Raises ValueError if *ts* does not start with a timestamp.
        """
        m = re.match(r'(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
        if m is None:
            raise ValueError('Invalid WebVTT timestamp: %r' % ts)
        hrs, mins, secs, frac = m.groups()
        # Pad/truncate the fraction to exactly three digits (milliseconds)
        msecs = int(((frac or '') + '000')[:3])
        # 90 ticks per millisecond, the unit MPEGTS values are compared in
        return 90 * (
            int(hrs or 0) * 3600000 +
            int(mins or 0) * 60000 +
            int(secs or 0) * 1000 +
            msecs
        )

    @staticmethod
    def _format_ts(ts):
        """Format a 90 kHz tick count as a WebVTT timestamp (hh:mm:ss.fff)."""
        # Integer arithmetic throughout, so the result does not depend on
        # Python 2 vs. Python 3 division semantics
        msecs = int(ts // 90)
        hrs, msecs = divmod(msecs, 3600000)
        mins, msecs = divmod(msecs, 60000)
        secs, msecs = divmod(msecs, 1000)
        return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msecs)

    def _parse_segment(self, lines, header, cues):
        """Parse the lines of one WebVTT segment.

        Header lines (other than X-TIMESTAMP-MAP) not seen before are
        appended to *header*; every cue found is appended to *cues* as a
        dict with the keys 'start' (90 kHz ticks, used for sorting),
        'start_ts', 'end_ts', 'style', 'text' and, optionally, 'id'.
        """
        line_iter = iter(lines)
        # An empty segment yields '' here and is reported as invalid below
        # instead of leaking StopIteration out of the downloader
        line = next(line_iter, '')
        if not line.startswith('WEBVTT') or (
                len(line) > 6 and not line.startswith(('WEBVTT ', 'WEBVTT\t'))):
            self.report_error('Not a valid WebVTT subtitles segment')

        try:
            # read header
            tsadj = 0
            while True:
                line = next(line_iter)
                if line == '' or '-->' in line:
                    break
                if line.startswith('X-TIMESTAMP-MAP='):
                    local = re.search(r'LOCAL:([0-9:.]+)', line)
                    mpegts = re.search(r'MPEGTS:([0-9]+)', line)
                    # Only apply the mapping when both halves are present
                    if local and mpegts:
                        tsadj = int(mpegts.group(1)) - self._parse_ts(local.group(1))
                elif line not in header:
                    # Segments tend to repeat the same header lines; keep
                    # only the first occurrence of each
                    header.append(line)

            # read cues
            while True:
                while line == '':
                    line = next(line_iter)

                cue = {}
                if '-->' not in line:
                    # A line without an arrow preceding the timings is
                    # treated as the cue identifier
                    cue['id'] = line
                    line = next(line_iter)
                    if line == '':
                        continue

                m = re.match(r'^([0-9:.]+\s*)-->\s*([0-9:.]+)(\s+.*)?', line)
                if not m:
                    # A malformed timing line must be consumed here, as the
                    # identifier branch above will not advance past a line
                    # containing '-->' (this would otherwise loop forever)
                    if '-->' in line:
                        line = next(line_iter)
                    continue

                ts_start = self._parse_ts(m.group(1)) + tsadj
                ts_end = self._parse_ts(m.group(2)) + tsadj
                cue['style'] = m.group(3) or ''
                cue['start'] = ts_start
                cue['start_ts'] = self._format_ts(ts_start)
                cue['end_ts'] = self._format_ts(ts_end)

                line = next(line_iter)
                cue['text'] = ''
                try:
                    while line != '' and '-->' not in line:
                        cue['text'] += line + '\n'
                        line = next(line_iter)
                finally:
                    # Keep the cue even if the segment ends mid-payload
                    cues.append(cue)
        except StopIteration:
            # End of segment
            pass

    def real_download(self, filename, info_dict):
        """Download the subtitles referenced by info_dict['url'] to *filename*.

        Returns True on success and False as soon as a segment download
        reports failure.
        """
        url = info_dict['url']
        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
        manifest = self.ydl.urlopen(url).read().decode('utf-8', 'ignore')

        # Every non-empty, non-comment playlist line is a segment URI,
        # resolved against the playlist URL unless already absolute
        segment_urls = []
        for line in manifest.splitlines():
            line = line.strip()
            if line and not line.startswith('#'):
                segment_urls.append(
                    line if re.match(r'^https?://', line)
                    else compat_urlparse.urljoin(url, line))

        ctx = {
            'filename': filename,
            'total_frags': len(segment_urls),
        }

        # Expected to populate ctx['tmpfilename'], ctx['dl'] and
        # ctx['dest_stream'], which are read below
        self._prepare_and_start_frag_download(ctx)

        cues = []
        header = []
        frags_filenames = []
        for i, frag_url in enumerate(segment_urls):
            frag_name = 'Frag%d' % i
            frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
            if not ctx['dl'].download(frag_filename, {'url': frag_url}):
                return False

            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
            try:
                frag_data = down.read()
            finally:
                down.close()
            frags_filenames.append(frag_sanitized)

            # 'utf-8-sig' strips an optional leading BOM, which would
            # otherwise make the 'WEBVTT' signature check fail
            lines = frag_data.decode('utf-8-sig', 'ignore').splitlines()
            self._parse_segment(lines, header, cues)

        # Sort on the numeric start time; the formatted string does not
        # sort correctly once the hours field grows past two digits
        cues.sort(key=lambda cue: cue['start'])

        with ctx['dest_stream'] as outf:
            outf.write(b'WEBVTT\n')
            for item in header:
                outf.write(('%s\n' % item).encode('utf-8'))
            for cue in cues:
                outf.write(b'\n')
                if cue.get('id'):
                    outf.write(('%s\n' % cue['id']).encode('utf-8'))
                outf.write(
                    ('%s --> %s%s\n' % (cue['start_ts'], cue['end_ts'], cue['style']))
                    .encode('utf-8')
                )
                outf.write(cue['text'].encode('utf-8'))

        self._finish_frag_download(ctx)

        for frag_file in frags_filenames:
            os.remove(encodeFilename(frag_file))

        return True

View File

@ -1667,6 +1667,9 @@ class InfoExtractor(object):
'url': media['URI'], 'url': media['URI'],
'ext': determine_ext(media['URI']) 'ext': determine_ext(media['URI'])
} }
if sub_info['ext'] == 'm3u8': # XXX
sub_info['ext'] = 'vtt'
sub_info['protocol'] = 'm3u8_webvtt'
subtitles.setdefault(lang, []).append(sub_info) subtitles.setdefault(lang, []).append(sub_info)
if media_type not in ('VIDEO', 'AUDIO'): if media_type not in ('VIDEO', 'AUDIO'):
return return