From 33f25ad94d8cd445e80d544d26dc017577d33416 Mon Sep 17 00:00:00 2001 From: Amram Oren Titane Date: Sun, 18 Jun 2017 18:48:19 -0400 Subject: [PATCH] Regrouped most '_type: url' and '_type: playlist' in a single multidimensional playlist (entries_for_batch) for webpages batch processing --- youtube_dl/extractor/generic.py | 268 ++++++++++++++++---------------- 1 file changed, 138 insertions(+), 130 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8ef1a2980..40e60314d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1827,11 +1827,13 @@ class GenericIE(InfoExtractor): } def _real_extract(self, url): + entries_for_batch = []; + if url.startswith('//'): - return { + entries_for_batch.append({ '_type': 'url', 'url': self.http_scheme() + url, - } + }) parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: @@ -1842,7 +1844,7 @@ class GenericIE(InfoExtractor): if default_search in ('auto', 'auto_warning', 'fixup_error'): if '/' in url: self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') - return self.url_result('http://' + url) + entries_for_batch.append(self.url_result('http://' + url)) elif default_search != 'fixup_error': if default_search == 'auto_warning': if re.match(r'^(?:url|URL)$', url): @@ -1852,7 +1854,7 @@ class GenericIE(InfoExtractor): else: self._downloader.report_warning( 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' 
% url) - return self.url_result('ytsearch:' + url) + entries_for_batch.append(self.url_result('ytsearch:' + url)) if default_search in ('error', 'fixup_error'): raise ExtractorError( @@ -1862,7 +1864,7 @@ class GenericIE(InfoExtractor): else: if ':' not in default_search: default_search += ':' - return self.url_result(default_search + url) + entries_for_batch.append(self.url_result(default_search + url)) url, smuggled_data = unsmuggle_url(url) force_videoid = None @@ -1889,7 +1891,7 @@ class GenericIE(InfoExtractor): if force_videoid: new_url = smuggle_url( new_url, {'force_videoid': force_videoid}) - return self.url_result(new_url) + entries_for_batch.append(self.url_result(new_url)) full_response = None if head_response is False: @@ -1970,7 +1972,7 @@ class GenericIE(InfoExtractor): try: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': - return self._extract_rss(url, video_id, doc) + entries_for_batch.append(self._extract_rss(url, video_id, doc)) elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'] = self._parse_ism_formats(doc, url) self._sort_formats(info_dict['formats']) @@ -1980,7 +1982,7 @@ class GenericIE(InfoExtractor): self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': - return self.playlist_result(self._parse_xspf(doc, video_id), video_id) + entries_for_batch.append(self.playlist_result(self._parse_xspf(doc, video_id), video_id)) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, @@ -1998,7 +2000,7 @@ class GenericIE(InfoExtractor): # Is it a Camtasia project? camtasia_res = self._extract_camtasia(url, video_id, webpage) if camtasia_res is not None: - return camtasia_res + entries_for_batch.append(camtasia_res) # Sometimes embedded video player is hidden behind percent encoding # (e.g. 
https://github.com/rg3/youtube-dl/issues/2448) @@ -2042,44 +2044,44 @@ class GenericIE(InfoExtractor): 'ie_key': 'BrightcoveLegacy' } for bc_url in bc_urls] - return { + entries_for_batch.append({ '_type': 'playlist', 'title': video_title, 'id': video_id, 'entries': entries, - } + }) # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(self, webpage) if bc_urls: - return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') + entries_for_batch.append(self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')) # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') + entries_for_batch.append(self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')) # Look for Vessel embeds vessel_urls = VesselIE._extract_urls(webpage) if vessel_urls: - return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) + entries_for_batch.append(self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())) # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') + entries_for_batch.append(self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')) vimeo_urls = VimeoIE._extract_urls(url, webpage) if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) + entries_for_batch.append(self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', webpage, 'vid.me embed', default=None) if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') + 
entries_for_batch.append(self.url_result(vid_me_embed_url, 'Vidme')) # Look for embedded YouTube player matches = re.findall(r'''(?x) @@ -2096,6 +2098,7 @@ class GenericIE(InfoExtractor): (?:embed|v|p)/.+?) \1''', webpage) if matches: + # can't remove return else errors return self.playlist_from_matches( matches, video_id, video_title, lambda m: unescapeHTML(m[1])) @@ -2103,18 +2106,18 @@ class GenericIE(InfoExtractor): matches = re.findall( r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) + entries_for_batch.append(self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))) # Look for Wordpress "YouTube Video Importer" plugin matches = re.findall(r'''(?x)]+ class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + entries_for_batch.append(self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])) matches = DailymotionIE._extract_urls(webpage) if matches: - return self.playlist_from_matches(matches, video_id, video_title) + entries_for_batch.append(self.playlist_from_matches(matches, video_id, video_title)) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -2123,56 +2126,56 @@ class GenericIE(InfoExtractor): playlists = re.findall( r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) + entries_for_batch.append(self.playlist_from_matches( + playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)) # Look for embedded Wistia player wistia_url = WistiaIE._extract_url(webpage) if wistia_url: - return { + entries_for_batch.append({ '_type': 'url_transparent', 'url': 
self._proto_relative_url(wistia_url), 'ie_key': WistiaIE.ie_key(), 'uploader': video_uploader, - } + }) # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: - return self.url_result(svt_url, 'SVT') + entries_for_batch.append(self.url_result(svt_url, 'SVT')) # Look for Bandcamp pages with custom domain mobj = re.search(r']*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: burl = unescapeHTML(mobj.group(1)) # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) + entries_for_batch.append(self.url_result(burl)) # Look for embedded Vevo player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) if mobj is not None: - return self.url_result(mobj.group('url')) + entries_for_batch.append(self.url_result(mobj.group('url'))) # Look for embedded Viddler player mobj = re.search( r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', webpage) if mobj is not None: - return self.url_result(mobj.group('url')) + entries_for_batch.append(self.url_result(mobj.group('url'))) # Look for NYTimes player mobj = re.search( r']+src=(["\'])(?P(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', webpage) if mobj is not None: - return self.url_result(mobj.group('url')) + entries_for_batch.append(self.url_result(mobj.group('url'))) # Look for Libsyn player mobj = re.search( r']+src=(["\'])(?P(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) if mobj is not None: - return self.url_result(mobj.group('url')) + entries_for_batch.append(self.url_result(mobj.group('url'))) # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or @@ -2194,19 +2197,19 @@ class GenericIE(InfoExtractor): if mobj is not None: embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: - return self.playlist_from_matches( + 
entries_for_batch.append(self.playlist_from_matches( embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') + getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')) # Look for Aparat videos mobj = re.search(r'