From 8e1b235c0ec311e114e392003839f9d06314a969 Mon Sep 17 00:00:00 2001
From: LexManos
Date: Fri, 10 Jan 2020 00:31:55 -0800
Subject: [PATCH 1/2] Add bulk support for go.

---
 youtube_dl/extractor/common.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 youtube_dl/extractor/go.py     | 28 ++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index eaae5e484..c562c6153 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1007,6 +1007,49 @@ class InfoExtractor(object):
             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
+    def _search_regex_all(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
+        """
+        Like _search_regex, but collect every match of every pattern instead
+        of stopping at the first one.
+        Return a list holding, for each match, the first matching group (or
+        the group selected by `group`), ordered by pattern and then by
+        position in the string.
+        On failure return `default` when one is given; otherwise raise
+        RegexNotFoundError if fatal, else report a warning and return None.
+        """
+        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+            patterns = (pattern,)
+        else:
+            patterns = pattern
+
+        ret = []
+        for p in patterns:
+            for mobj in re.finditer(p, string, flags):
+                if group is not None:
+                    ret.append(mobj.group(group))
+                    continue
+                # first matching group; skip a match in which no group took
+                # part rather than letting next() raise StopIteration
+                value = next((g for g in mobj.groups() if g is not None), None)
+                if value is not None:
+                    ret.append(value)
+
+        if ret:
+            return ret
+        if default is not NO_DEFAULT:
+            return default
+
+        # the display name is only needed for the error/warning below
+        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
+            _name = '\033[0;34m%s\033[0m' % name
+        else:
+            _name = name
+
+        if fatal:
+            raise RegexNotFoundError('Unable to extract %s' % _name)
+        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
+        return None
+
     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 03cfba91f..40707ac6e 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -132,14 +132,21 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( + video_id = self._search_regex_all( ( # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood r'data-video-id=["\']*(VDKA\w+)', # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + ), webpage, 'video id', default=[video_id]) + + # Remove duplicates and nulls + if video_id: + tmp = [] + [tmp.append(x) for x in video_id if x and x not in tmp] + video_id = tmp + if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', @@ -160,6 +167,23 @@ class GoIE(AdobePassIE): video['url'], 'Go', video.get('id'), video.get('title'))) entries.reverse() return self.playlist_result(entries, show_id, show_title) + + if not isinstance(video_id, list): + video_id = [video_id] + + entries = [] + for id in video_id: + entry = self._real_extract_single(id, site_info, brand) + if entry: + entries.append(entry) + + if len(entries) == 0: + return None + elif len(entries) == 1: + return entries[0] + return self.playlist_result(entries) + + def _real_extract_single(self, video_id, site_info, brand): video_data = self._extract_videos(brand, video_id)[0] video_id = video_data['id'] title = video_data['title'] From 58fbc02cc6d70b26b4004d79a992a13beb740a1e Mon Sep 17 00:00:00 2001 From: LexManos Date: Fri, 10 Jan 2020 12:24:04 -0800 Subject: [PATCH 2/2] Rank flake8 --- youtube_dl/extractor/go.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/go.py 
b/youtube_dl/extractor/go.py index 40707ac6e..1f784ed15 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -140,13 +140,13 @@ class GoIE(AdobePassIE): # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' ), webpage, 'video id', default=[video_id]) - + # Remove duplicates and nulls if video_id: tmp = [] [tmp.append(x) for x in video_id if x and x not in tmp] video_id = tmp - + if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', @@ -167,23 +167,23 @@ class GoIE(AdobePassIE): video['url'], 'Go', video.get('id'), video.get('title'))) entries.reverse() return self.playlist_result(entries, show_id, show_title) - + if not isinstance(video_id, list): video_id = [video_id] - + entries = [] for id in video_id: - entry = self._real_extract_single(id, site_info, brand) + entry = self._real_extract_single(url, id, site_info, brand) if entry: entries.append(entry) - + if len(entries) == 0: return None elif len(entries) == 1: return entries[0] return self.playlist_result(entries) - - def _real_extract_single(self, video_id, site_info, brand): + + def _real_extract_single(self, url, video_id, site_info, brand): video_data = self._extract_videos(brand, video_id)[0] video_id = video_data['id'] title = video_data['title']