From d04d262af8d6044b08194a92c6d07804b88381e9 Mon Sep 17 00:00:00 2001 From: jesus Date: Thu, 14 Mar 2019 16:13:37 +0100 Subject: [PATCH 1/3] [generic] Look at all JSON-LD blobs, not just the first one --- youtube_dl/extractor/common.py | 37 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dfd0584d3..be5c9a47f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -965,27 +965,41 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None, return_all=False): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. In case of failure return a default value or raise a WARNING or a RegexNotFoundError, depending on fatal, specifying the field name.
""" + matches = [] + if isinstance(pattern, (str, compat_str, compiled_regex_type)): - mobj = re.search(pattern, string, flags) + if return_all: + matches = list(re.finditer(pattern, string, flags)) + else: + mobj = re.search(pattern, string, flags) else: for p in pattern: - mobj = re.search(p, string, flags) - if mobj: - break + if return_all: + new_matches = list(re.finditer(p, string, flags)) + matches.extend(new_matches) + else: + mobj = re.search(p, string, flags) + if mobj: + break if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name - if mobj: + if return_all and len(matches) > 0: + if group is None: + return list(map(lambda m: next(g for g in m.groups() if g is not None), matches)) + else: + return list(map(lambda m: m.group(group), matches)) + elif mobj: if group is None: # return the first matching group return next(g for g in mobj.groups() if g is not None) @@ -1174,16 +1188,19 @@ class InfoExtractor(object): 'twitter card player') def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld = self._search_regex( - JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) + json_lds = self._search_regex( + JSON_LD_RE, html, 'JSON-LD', group='json_ld', return_all=True, **kwargs) default = kwargs.get('default', NO_DEFAULT) - if not json_ld: + if not json_lds or len(json_lds) == 0: return default if default is not NO_DEFAULT else {} # JSON-LD may be malformed and thus `fatal` should be respected. # At the same time `default` may be passed that assumes `fatal=False` # for _search_regex. Let's simulate the same behavior here as well. 
fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False - return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) + for json_ld in json_lds: + found = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) + if found: + return found def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): From 8813775c9857fb2e65e47361ddd9d21c077366d8 Mon Sep 17 00:00:00 2001 From: jesus Date: Thu, 14 Mar 2019 16:28:35 +0100 Subject: [PATCH 2/3] [generic] Add test for multiple JSON-LD objects --- youtube_dl/extractor/generic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6f48b04da..97d30b658 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2101,6 +2101,19 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to download MPD manifest'], }, + { + # page with multiple JSON-LD objects + 'url': 'https://www.antena3.com/noticias/sociedad/grupo-personas-descuelga-pancarta-lazo-amarillo-ayuntamiento-barcelona-video_201903145c8a02c70cf2b779bc2dca9b.html', + 'md5': '5a9a0d6c788f0a4cee05d2c077b3637c', + 'info_dict': { + 'id': 'grupo-personas-descuelga-pancarta-lazo-amarillo-ayuntamiento-barcelona-video_201903145c8a02c70cf2b779bc2dca9b', + 'ext': 'mp4', + 'title': 'Un grupo de personas descuelga una pancarta con un lazo amarillo del Ayuntamiento de Barcelona y operarios la vuelven a colocar', + 'timestamp': 1552548539, + 'description': 'La retirada de la pancarta con el lazo amarillo en la fachada del Ayuntamiento de Barcelona ha durado poco. Tras su retirada por parte de un grupo de personas esta madrugada un grupo de operarios ha vuelto a colocarla.
', + 'upload_date': '20190314', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject From a01e16e5fd8fad5fc495cc1b6bfa0e0a0b64cfca Mon Sep 17 00:00:00 2001 From: jesus Date: Thu, 14 Mar 2019 19:45:56 +0100 Subject: [PATCH 3/3] [generic] Create new _search_regex_all method that returns all matches --- youtube_dl/extractor/common.py | 67 ++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index be5c9a47f..768fd8142 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -965,41 +965,27 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None, return_all=False): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. In case of failure return a default value or raise a WARNING or a RegexNotFoundError, depending on fatal, specifying the field name.
""" - matches = [] - - if isinstance(pattern, (str, compat_str, compiled_regex_type)): - if return_all: - matches = list(re.finditer(pattern, string, flags)) - else: - mobj = re.search(pattern, string, flags) + mobj = re.search(pattern, string, flags) else: for p in pattern: - if return_all: - new_matches = list(re.finditer(p, string, flags)) - matches.extend(new_matches) - else: - mobj = re.search(p, string, flags) - if mobj: - break + mobj = re.search(p, string, flags) + if mobj: + break if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name - if return_all and len(matches) > 0: - if group is None: - return list(map(lambda m: next(g for g in m.groups() if g is not None), matches)) - else: - return list(map(lambda m: m.group(group), matches)) - elif mobj: + if mobj: if group is None: # return the first matching group return next(g for g in mobj.groups() if g is not None) @@ -1013,6 +999,40 @@ class InfoExtractor(object): self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None + def _search_regex_all(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): + """ + Perform a regex search on the given string, using a single or a list of + patterns, and return a list holding the matching group of every match found. + In case of failure return a default value or raise a WARNING or a + RegexNotFoundError, depending on fatal, specifying the field name.
+ """ + matches = [] + + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + matches = list(re.finditer(pattern, string, flags)) + else: + for p in pattern: + new_matches = list(re.finditer(p, string, flags)) + matches.extend(new_matches) + + if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): + _name = '\033[0;34m%s\033[0m' % name + else: + _name = name + + if matches: + if group is None: + return [next(g for g in m.groups() if g is not None) for m in matches] + else: + return [m.group(group) for m in matches] + elif default is not NO_DEFAULT: + return default + elif fatal: + raise RegexNotFoundError('Unable to extract %s' % _name) + else: + self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) + return None + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. @@ -1188,16 +1208,17 @@ class InfoExtractor(object): 'twitter card player') def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_lds = self._search_regex( - JSON_LD_RE, html, 'JSON-LD', group='json_ld', return_all=True, **kwargs) + json_lds = self._search_regex_all( + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) - if not json_lds or len(json_lds) == 0: + if not json_lds: return default if default is not NO_DEFAULT else {} # JSON-LD may be malformed and thus `fatal` should be respected. # At the same time `default` may be passed that assumes `fatal=False` # for _search_regex. Let's simulate the same behavior here as well. fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False for json_ld in json_lds: found = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) if found: return found + return default if default is not NO_DEFAULT else {}