From 9a2ef3f7fdaf84150d5124cbf58917d292b6997d Mon Sep 17 00:00:00 2001 From: sulyi Date: Sun, 20 Nov 2016 22:53:48 +0100 Subject: [PATCH 1/9] [utils] Fixing js_to_json --- youtube_dl/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9595bcf9f..877879446 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2109,10 +2109,13 @@ def js_to_json(code): return '"%s"' % v + # fixing , followed nothing, but comments + # fixing unnecessary ? in /\*.*?\*/ + # fixing greedy comment return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|,(?=\s*[\]}])| + /\*[^*]*\*/|,(?=(\s|(/\*[^*]*\*/))*[\]}])| [a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) From c1ffe56be750a069ee14ba38ece97b59f47d2833 Mon Sep 17 00:00:00 2001 From: sulyi Date: Sun, 20 Nov 2016 22:56:27 +0100 Subject: [PATCH 2/9] [jsinterp] Adding extract_arguments --- youtube_dl/jsinterp.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 9737f7002..d6ae43b94 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -243,6 +243,27 @@ class JSInterpreter(object): return self.build_function(argnames, func_m.group('code')) + def extract_arguments(self, call): + pattern = re.escape(call) if call.endswith(')') else r'%s\s*\(' % re.escape(call) + call_m = re.search(pattern, self.code) + + if call_m is None: + raise ExtractorError('Could not find JS call %r' % call) + # XXX: context-free! + close_pos = open_pos = call_m.end() + counter = 1 + while counter > 0: + if close_pos > len(self.code): + raise ExtractorError('Runaway argument found of JS call %r' % call) + c = self.code[close_pos] + close_pos += 1 + if c == '(': + counter += 1 + elif c == ')': + counter -= 1 + else: + return self.code[open_pos:close_pos - 1] + def call_function(self, funcname, *args): f = self.extract_function(funcname) return f(args) From d6ba53417a141c0ab05039be3d7597f57c9a232b Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 07:18:33 +0100 Subject: [PATCH 3/9] [utils] Fixing js_to_json * or / in comment --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 877879446..3a14048c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2115,7 +2115,7 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*[^*]*\*/|,(?=(\s|(/\*[^*]*\*/))*[\]}])| + /\*((?!\*/)\n|.)*\*/|,(?=(\s|(/\*((?!\*/)\n|.)*\*/))*[\]}])| [a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) From d13093bed37a17e96225eb587deacf3cd370f328 Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 07:23:22 +0100 Subject: [PATCH 4/9] [utils] Rebalance of pattern in js_to_json --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3a14048c9..bda59e627 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2115,8 +2115,8 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*((?!\*/)\n|.)*\*/|,(?=(\s|(/\*((?!\*/)\n|.)*\*/))*[\]}])| - [a-zA-Z_][.a-zA-Z_0-9]*| + ,(?=(\s|(/\*((?!\*/)\n|.)*\*/))*[\]}])| + /\*((?!\*/)\n|.)*\*/|[a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) ''', fix_kv, code) From cc17865bc4c9bbc089362638237e31e9a83ffd31 Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 11:33:57 +0100 Subject: [PATCH 5/9] [jsinterp] Adding in_string state to extract_arguments --- youtube_dl/jsinterp.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 72b63ca43..e12238911 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -252,15 +252,20 @@ class JSInterpreter(object): # XXX: context-free! close_pos = open_pos = call_m.end() counter = 1 + in_string = '' while counter > 0: if close_pos > len(self.code): raise ExtractorError('Runaway argument found of JS call %r' % call) c = self.code[close_pos] close_pos += 1 - if c == '(': + if c == '(' and not in_string: counter += 1 - elif c == ')': + elif c == ')' and not in_string: counter -= 1 + elif in_string and c == in_string: + in_string = '' + elif c in ['"', '\'']: + in_string = c else: return self.code[open_pos:close_pos - 1] From 1736a724389907a8fd9d78958c06a12d4df19f82 Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 11:36:38 +0100 Subject: [PATCH 6/9] [jsinterp] Handling comments - my first try --- test/test_jsinterp.py | 2 +- youtube_dl/jsinterp.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index c24b8ca74..310902e12 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -74,7 +74,7 @@ class TestJSInterpreter(unittest.TestCase): def test_comments(self): 'Skipping: Not yet fully implemented' - return + # return jsi = JSInterpreter(''' function x() { var x = /* 1 + */ 2; diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index e12238911..6029b4204 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -241,7 +241,23 @@ class JSInterpreter(object): raise ExtractorError('Could not find JS function %r' % funcname) argnames = func_m.group('args').split(',') - return self.build_function(argnames, func_m.group('code')) + def validate_token(m): + if m.group(0).startswith('/*') and m.group(0).endswith('*/'): + return '' + elif (m.group(0).startswith('"') and m.group(0).endswith('"') or + m.group(0).startswith('\'') and m.group(0).endswith('\'')): + return m.group(0) + else: + # This shouldn't happen + return m.group(0) + + # no comment + code = re.sub(r'''(?sx) + "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| + /\*(?:(?!\*/)(?:\n|.))*\*/''', validate_token, func_m.group('code')) + + return self.build_function(argnames, code) def extract_arguments(self, call): pattern = re.escape(call) if call.endswith(')') else r'%s\s*\(' % re.escape(call) From e6e9e1f45bc28c597ec43591fbb5bbd25b668720 Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 11:39:53 +0100 Subject: [PATCH 7/9] [utils] Fixing runaway comments in js_to_json --- youtube_dl/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bda59e627..bcea188a3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2109,14 +2109,11 @@ def js_to_json(code): return '"%s"' % v - # fixing , followed nothing, but comments - # fixing unnecessary ? in /\*.*?\*/ - # fixing greedy comment return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - ,(?=(\s|(/\*((?!\*/)\n|.)*\*/))*[\]}])| - /\*((?!\*/)\n|.)*\*/|[a-zA-Z_][.a-zA-Z_0-9]*| + ,(?=(\s|(/\*((?!\*/)(\n|.))*\*/))*[\]}])| + /\*((?!\*/)(\n|.))*\*/|[a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) ''', fix_kv, code) From 321045a3d56e08879076b3c5cdb0f86bd1e43237 Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 11:44:07 +0100 Subject: [PATCH 8/9] [utils] Using non capturing groups in js_to_json --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bcea188a3..548ce12e2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2112,8 +2112,8 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - ,(?=(\s|(/\*((?!\*/)(\n|.))*\*/))*[\]}])| - /\*((?!\*/)(\n|.))*\*/|[a-zA-Z_][.a-zA-Z_0-9]*| + ,(?=(?:\s|(?:/\*(?:(?!\*/)(?:\n|.))*\*/))*[\]}])| + /\*(?:(?!\*/)(?:\n|.))*\*/|[a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) ''', fix_kv, code) From d88d4535be1d7975346b54f4fffa138a51a98d5f Mon Sep 17 00:00:00 2001 From: sulyi Date: Mon, 21 Nov 2016 11:44:55 +0100 Subject: [PATCH 9/9] [utils] Adding test_js_to_json_landofoz --- test/test_utils.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 2e3cd0179..c9a3a851d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -743,6 +743,26 @@ class TestUtil(unittest.TestCase): inp = '''{"duration": "00:01:07"}''' self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''') + def test_js_to_json_landofoz(self): + inp = '''{ + character: { + name: "Dorothy", + pet: "Toto", + /* source: "Kansas", + destination: "Emerald City", + roll: "heroine" */ + }, + comment: /* over the rainbow */ "/*", + no_comment: "*/" + }''' + self.assertEqual(js_to_json(inp), '''{ + "character": { + "name": "Dorothy", + "pet": "Toto"\n \n }, + "comment": "/*", + "no_comment": "*/" + }''') + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})