From 8dfc27b7d305bc7e74c2250fb5080a67071de7ea Mon Sep 17 00:00:00 2001 From: felix Date: Sun, 13 Mar 2016 12:29:15 +0100 Subject: [PATCH 1/2] [utils] js_to_json: various improvements now JS object literals like { /* " */ 0: ",]\xaa<\/p>", } will be correctly converted to JSON. --- test/test_utils.py | 12 ++++++++++++ youtube_dl/utils.py | 30 ++++++++++++++++-------------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a35debfe1..db189f1f6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -631,6 +631,18 @@ class TestUtil(unittest.TestCase): on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{ 0: /* " \n */ ",]" , }') + self.assertEqual(json.loads(on), {'0': ',]'}) + + on = js_to_json(r'["<p>x<\/p>"]') + self.assertEqual(json.loads(on), ['<p>x</p>']) + + on = js_to_json(r'["\xaa"]') + self.assertEqual(json.loads(on), ['\u00aa']) + + on = js_to_json("['a\\\nb']") + self.assertEqual(json.loads(on), ['ab']) + def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b6e1dc809..1400ff2db 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1847,24 +1847,26 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - if v.startswith('"'): - v = re.sub(r"\\'", "'", v[1:-1]) - elif v.startswith("'"): - v = v[1:-1] - v = re.sub(r"\\\\|\\'|\"", lambda m: { - '\\\\': '\\\\', - "\\'": "'", + elif v.startswith('/*') or v == ',': + return "" + + if v[0] in ("'", '"'): + v = re.sub(r'(?s)\\.|"', lambda m: { '"': '\\"', - }[m.group(0)], v) + "\\'": "'", + '\\\n': '', + '\\x': '\\u00', + }.get(m.group(0), m.group(0)), v[1:-1]) + return '"%s"' % v - res = re.sub(r'''(?x) - "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'| - [a-zA-Z_][.a-zA-Z_0-9]* + return re.sub(r'''(?sx) + "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| + /\*.*?\*/|,(?=\s*[\]}])| + [a-zA-Z_][.a-zA-Z_0-9]*| + [0-9]+(?=\s*:) ''', fix_kv, code) - res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res) - return res def qualities(quality_ids): From 2f481215d95c738dad530349392f83a488ea145b Mon Sep 17 00:00:00 2001 From: felix Date: Sun, 20 Mar 2016 12:17:57 +0100 Subject: [PATCH 2/2] [ora] minimise fragile regex shenanigans; recognise unsafespeech.com URLs --- youtube_dl/extractor/ora.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py index 8545fb1b8..cfae71bcc 100644 --- a/youtube_dl/extractor/ora.py +++ b/youtube_dl/extractor/ora.py @@ -6,13 +6,14 @@ from .common import InfoExtractor from ..compat import 
compat_urlparse from ..utils import ( get_element_by_attribute, + js_to_json, qualities, unescapeHTML, ) class OraTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)' + _VALID_URL = r'https?://(?:www\.)?(ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)' _TEST = { 'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq', 'md5': 'fa33717591c631ec93b04b0e330df786', @@ -28,10 +29,13 @@ class OraTVIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_data = self._search_regex( - r'"(?:video|current)"\s*:\s*({[^}]+?})', webpage, 'current video') - m3u8_url = self._search_regex( - r'hls_stream"?\s*:\s*"([^"]+)', video_data, 'm3u8 url', None) + ora_meta = self._parse_json(self._search_regex( + r'(?s);\s*ora_meta = ({.*?});', webpage, 'ora_meta'), display_id, + transform_source=lambda data: js_to_json(re.sub('":(document|\().*?(:false|\(\)),', '":null,', data))) + + video_data = ora_meta.get('video', ora_meta.get('current')) + m3u8_url = video_data['hls_stream'] + if m3u8_url: formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', 'm3u8_native', @@ -60,13 +64,11 @@ class OraTVIE(InfoExtractor): r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube') return { - 'id': self._search_regex( - r'"id"\s*:\s*(\d+)', video_data, 'video id', default=display_id), + 'id': video_data.get('id', display_id), 'display_id': display_id, 'title': unescapeHTML(self._og_search_title(webpage)), 'description': get_element_by_attribute( 'class', 'video_txt_decription', webpage), - 'thumbnail': self._proto_relative_url(self._search_regex( - r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)), + 'thumbnail': self._proto_relative_url(video_data.get('thumb')), 'formats': formats, }