From faa5b2ed5463c815be7ce4b48f0fb197f1978744 Mon Sep 17 00:00:00 2001
From: Forthrin
Date: Wed, 8 May 2019 13:27:37 +0200
Subject: [PATCH] [postprocessor/ffmpeg] Support for DCSubtitle (XML) format

---
 youtube_dl/postprocessor/ffmpeg.py | 10 ++++--
 youtube_dl/utils.py                | 62 +++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 5bcb00ac0..3c29045d6 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -19,6 +19,7 @@ from ..utils import (
     shell_quote,
     subtitles_filename,
     dfxp2srt,
+    dc2srt,
     ISO639Utils,
     replace_extension,
 )
@@ -610,7 +611,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
             sub_filenames.append(old_file)
             new_file = subtitles_filename(filename, lang, new_ext)
 
-            if ext in ('dfxp', 'ttml', 'tt'):
+            if ext in ('dfxp', 'ttml', 'tt', 'xml'):
                 self._downloader.report_warning(
                     'You have requested to convert dfxp (TTML) subtitles into another format, '
                     'which results in style information loss')
@@ -619,7 +620,12 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                 srt_file = subtitles_filename(filename, lang, 'srt')
 
                 with open(dfxp_file, 'rb') as f:
-                    srt_data = dfxp2srt(f.read())
+                    sub_data = f.read()
+
+                if ext == 'xml':
+                    srt_data = dc2srt(sub_data)
+                else:
+                    srt_data = dfxp2srt(sub_data)
 
                 with io.open(srt_file, 'wt', encoding='utf-8') as f:
                     f.write(srt_data)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 71713f63a..5069e170f 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2721,6 +2721,68 @@ def match_filter_func(filter_str):
     return _match_func
 
 
+def dc_time_to_srt_time(dc_time):
+    """Convert a DCSubtitle timestamp into an SRT timestamp.
+
+    DCSubtitle times are either HH:MM:SS:TTT, where TTT counts ticks of
+    4 ms (0-249), or HH:MM:SS.mmm with a plain decimal fraction.
+    """
+    mobj = re.match(r'^\s*(\d+):(\d+):(\d+)(?:([:.])(\d+))?\s*$', dc_time)
+    if not mobj:
+        raise ValueError('Invalid DCSubtitle time: %r' % dc_time)
+    hours, minutes, seconds, sep, fraction = mobj.groups()
+    if fraction is None:
+        msec = 0
+    elif sep == ':':
+        msec = int(fraction) * 4
+    else:
+        msec = int(fraction.ljust(3, '0')[:3])
+    # Normalise through a total so out-of-range fields carry over
+    total = ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + msec
+    return '%02d:%02d:%02d,%03d' % (
+        total // 3600000, total // 60000 % 60, total // 1000 % 60, total % 1000)
+
+
+def parse_dc_subtitles(dc):
+    """Parse DCSubtitle XML into a list of cue dicts.
+
+    Each dict has the keys 'number', 'start', 'end' and 'text'.
+    """
+    def element_text(element):
+        # <Text> may wrap its content in child elements (e.g. <Font>), in
+        # which case element.text alone is None or incomplete
+        parts = [element.text or '']
+        for child in element:
+            parts.append(element_text(child))
+            parts.append(child.tail or '')
+        return ''.join(parts)
+
+    subs = []
+    root = xml.etree.ElementTree.fromstring(dc)
+    # <Subtitle> elements normally sit inside <Font>, but the wrapper may be
+    # absent or nested, so search the whole tree
+    for num, subtitle in enumerate(root.findall('.//Subtitle'), 1):
+        subs.append({
+            'number': subtitle.get('SpotNumber') or '%d' % num,
+            'start': subtitle.attrib['TimeIn'],
+            'end': subtitle.attrib['TimeOut'],
+            'text': '\n'.join(element_text(text) for text in subtitle.findall('Text')),
+        })
+    return subs
+
+
+def dc2srt(dc):
+    """Convert DCSubtitle XML data into SRT formatted text."""
+    srt = []
+    for sub in parse_dc_subtitles(dc):
+        srt.append(sub['number'])
+        srt.append('%s --> %s' % (
+            dc_time_to_srt_time(sub['start']), dc_time_to_srt_time(sub['end'])))
+        srt.append(sub['text'])
+        srt.append('')
+    return '\n'.join(srt)
+
+
 def parse_dfxp_time_expr(time_expr):
     if not time_expr:
         return