From 300a9da662dca1441eeab0330f02a85c21f46f0c Mon Sep 17 00:00:00 2001 From: uno20001 <> Date: Sat, 2 Nov 2019 16:25:27 +0100 Subject: [PATCH 1/3] [youtube_dl/utils] add support for octal escape sequences to js_to_json() --- youtube_dl/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index aed988b88..85fe1e34a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3979,18 +3979,27 @@ def js_to_json(code): def fix_kv(m): v = m.group(0) + if v in ('true', 'false', 'null'): return v elif v.startswith('/*') or v.startswith('//') or v == ',': return "" if v[0] in ("'", '"'): + + v = v[1:-1] + + # convert Javascript's octal escape sequences (and '\0') + # into valid JSON escape sequences ( e.g '\347' => '\u00e7', '\0' => '\u0000' ) + # regex based on https://mathiasbynens.be/notes/javascript-escapes + v = re.sub(r'\\([0-7]{1,3})', lambda x: "\\u%04x" % int(x.group(1), 8), v) + v = re.sub(r'(?s)\\.|"', lambda m: { '"': '\\"', "\\'": "'", '\\\n': '', '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) + }.get(m.group(0), m.group(0)), v) for regex, base in INTEGER_TABLE: im = re.match(regex, v) @@ -4001,8 +4010,8 @@ def js_to_json(code): return '"%s"' % v return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| + "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]?))*[^"\\]*"| + '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]?))*[^'\\]*'| {comment}|,(?={skip}[\]}}])| (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| From b812fcee81e707eaeba2fe2353033887f52e409f Mon Sep 17 00:00:00 2001 From: uno20001 <> Date: Sat, 2 Nov 2019 16:25:58 +0100 Subject: [PATCH 2/3] [EnseignerTV5Monde] new extractor (closes #22851) --- youtube_dl/extractor/enseignertv5monde.py | 44 +++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 45 insertions(+) create mode 100644 
youtube_dl/extractor/enseignertv5monde.py diff --git a/youtube_dl/extractor/enseignertv5monde.py b/youtube_dl/extractor/enseignertv5monde.py new file mode 100644 index 000000000..a08632dd9 --- /dev/null +++ b/youtube_dl/extractor/enseignertv5monde.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class EnseignerTV5MondeIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:www\.)?enseigner\.tv5monde\.com\/videos-sans-fiche\/(?P<id>[a-zA-Z0-9\-]+)' + _TESTS = [ + { + 'url': 'https://enseigner.tv5monde.com/videos-sans-fiche/la-culture-en-france', + 'md5': 'bb9e4c4701c1873a3790a0a33eb89ce6', + 'info_dict': { + 'id': 'la-culture-en-france', + 'ext': 'mp4', + 'title': 'La culture en France', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }, + { + 'url': 'https://enseigner.tv5monde.com/videos-sans-fiche/les-chapeaux-de-la-maison-michel', + 'md5': 'bdfc21506aee0ffa2afd823f1c44ce66', + 'info_dict': { + 'id': 'les-chapeaux-de-la-maison-michel', + 'ext': 'mp4', + 'title': 'Les chapeaux de la Maison Michel', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + d = self._extract_jwplayer_data(webpage, video_id, require_title=False) + + d.update({ + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'release_date': unified_strdate(self._html_search_regex(r'itemprop="datePublished"[^>]*>([0-9/]+)</time>', webpage, 'release date', fatal=False))}) + + return d diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index af3fff601..06c546add 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -321,6 +321,7 @@ from .ellentube import ( from .elpais import ElPaisIE from .embedly import EmbedlyIE from .engadget import EngadgetIE +from .enseignertv5monde 
import EnseignerTV5MondeIE from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE From 1ebe09fb349a70954347d85f6aa2fcb4d0e0f3ad Mon Sep 17 00:00:00 2001 From: uno20001 <> Date: Sat, 2 Nov 2019 20:47:40 +0100 Subject: [PATCH 3/3] [youtube_dl/utils] fix failed test --- youtube_dl/utils.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 85fe1e34a..0cb410ac7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3976,6 +3976,23 @@ def js_to_json(code): (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), ) + + def convert_escapes(m): + # convert Javascript's octal escape sequences (and '\0') + # into valid JSON escape sequences ( e.g '\347' => '\u00e7', '\0' => '\u0000' ) + if m.group(1): + return "\\u%04x" % int(m.group(1), 8) + + # convert the remaining escape sequences + # into valid JSON + return { + '"': '\\"', + "\\'": "'", + '\\\n': '', + '\\x': '\\u00', + }.get(m.group(0), m.group(0)) + + def fix_kv(m): v = m.group(0) @@ -3986,20 +4003,7 @@ def js_to_json(code): return "" if v[0] in ("'", '"'): - - v = v[1:-1] - - # convert Javascript's octal escape sequences (and '\0') - # into valid JSON escape sequences ( e.g '\347' => '\u00e7', '\0' => '\u0000' ) - # regex based on https://mathiasbynens.be/notes/javascript-escapes - v = re.sub(r'\\([0-7]{1,3})', lambda x: "\\u%04x" % int(x.group(1), 8), v) - - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v) + v = re.sub(r'(?s)\\(?:([0-7]{1,3})|.)|"', convert_escapes, v[1:-1]) for regex, base in INTEGER_TABLE: im = re.match(regex, v)