From 056c797523bc05123f04356bb9cf7bcc597e3ef1 Mon Sep 17 00:00:00 2001 From: fnord Date: Wed, 15 Jul 2015 04:22:53 -0500 Subject: [PATCH 1/4] dfxp2srt: do not translate '\n's to line breaks, only <br> and <br/>

--- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 942f76d24..68d2f4984 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1865,7 +1865,7 @@ def dfxp2srt(dfxp_data): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = xml.etree.ElementTree.fromstring(dfxp_data.replace('\n','').encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') From 7db0a3d1bc3e33972603bad07cb08ec9c273c9d9 Mon Sep 17 00:00:00 2001 From: fnord Date: Wed, 15 Jul 2015 04:29:10 -0500 Subject: [PATCH 2/4] dfxp2srt: Fix disappearing words after <span>s ( '<span>...</span> this-goes-missing' ). Ensure trailing whitespace/newlines are not added. --- youtube_dl/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 68d2f4984..485408baa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1850,6 +1850,12 @@ def dfxp2srt(dfxp_data): 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', }) + + def text_or_empty(v): + str = str_or_none(v, '') + return '' if not re.search(r'[^\s]',str,re.DOTALL) else str + + def parse_node(node): str_or_empty = functools.partial(str_or_none, default='') @@ -1859,7 +1865,7 @@ def dfxp2srt(dfxp_data): if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): out += '\n' + str_or_empty(child.tail) elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): - out += str_or_empty(parse_node(child)) + out += str_or_empty(parse_node(child)) + text_or_empty(child.tail) else: out += str_or_empty(xml.etree.ElementTree.tostring(child)) From e6b0b86eb2101a6db945febb25c20bc895a76d4d Mon Sep 17 00:00:00 2001 From: fnord Date: Wed, 15 Jul 2015 05:59:33 -0500 Subject: [PATCH 3/4] explain purpose of text_or_empty --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 
485408baa..5a9c1ea3a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1852,6 +1852,7 @@ def dfxp2srt(dfxp_data): def text_or_empty(v): + """ return string that contains something other than whitespace, or '' """ str = str_or_none(v, '') return '' if not re.search(r'[^\s]',str,re.DOTALL) else str From 8f057ac0bd638f1f3a422da36befcd23da383423 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 03:05:24 -0500 Subject: [PATCH 4/4] dfxp: fix syntax --- youtube_dl/utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5a9c1ea3a..05d1f82f3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1850,12 +1850,10 @@ def dfxp2srt(dfxp_data): 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', }) - def text_or_empty(v): """ return string that contains something other than whitespace, or '' """ str = str_or_none(v, '') - return '' if not re.search(r'[^\s]',str,re.DOTALL) else str - + return '' if not re.search(r'[^\s]', str, re.DOTALL) else str def parse_node(node): str_or_empty = functools.partial(str_or_none, default='') @@ -1872,7 +1870,7 @@ def dfxp2srt(dfxp_data): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.replace('\n','').encode('utf-8')) + dfxp = xml.etree.ElementTree.fromstring(dfxp_data.replace('\n', '').encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')