From 9dc8c6eb232df8f12bc1bf0ac4ea327503d450f9 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 14:39:40 -0400 Subject: [PATCH 01/12] added moviestorm InfoExtractor. This is a link farm handler that scrapes urls from a moviestorm page, which are then handed off to one of youtube-dl's other handlers for download/extraction --- youtube_dl/extractor/moviestorm.py | 128 +++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 youtube_dl/extractor/moviestorm.py diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py new file mode 100644 index 000000000..a4981850a --- /dev/null +++ b/youtube_dl/extractor/moviestorm.py @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os.path +import re +from time import sleep + +from .common import InfoExtractor +from ..utils import ExtractorError +from ..compat import ( + compat_html_parser, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, +) + +class MovieStormHTMLParser(compat_html_parser.HTMLParser): + def __init__(self): + self.found_button = False + self.watch_urls = [] + self.direct_url = False + compat_html_parser.HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + attrs = dict((k, v) for k, v in attrs) + if tag == 'td' and attrs['class'] == 'link_td': + self.found_button = True + elif tag == 'a' and self.found_button: + # suppress ishare and other direct links, can't handle now + if 'moviestorm' in attrs['href']: + self.watch_urls.append(attrs['href'].strip()) + elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link': + self.direct_url = attrs['href'].strip() + + def handle_endtag(self, tag): + if tag == 'td': + self.found_button = False + + @classmethod + def extract_watch_urls(cls, html): + p = cls() + p.feed(html) + p.close() + return p.watch_urls + + @classmethod + def extract_direct_url(cls, html): + p = cls() + p.feed(html) + p.close() + return p.direct_url + 
+class MovieStormIE(InfoExtractor): + IE_DESC = 'Movie Storm (link farm)' + IE_NAME = 'MovieStorm' + _VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)' + _LINK_FARM = True + + # There are no tests for this IE because the links on any given moviestorm + # page can dynamically change, and because the actual download/extraction + # is ultimately preformed by another IE. An example of an acceptable url to + # feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1 + _TEST = False + + # moviestorm's drupal db config is unstable at times + # retry up to 5 times before giving up, 5 second delay + # between each retry + retry_count = 0 + max_retries = 5 + retry_wait = 5 + direct_urls = [] + + def _parse_target(self, target): + uri = compat_urlparse.urlparse(target) + hash = uri.fragment[1:].split('?')[0] + token = os.path.basename(hash.rstrip('/')) + return (uri, hash, token) + + def _real_extract(self, url): + # retry loop to capture moviestorm page + while True: + if self.retry_count == 0: + note = 'Downloading link farm page' + else: + note = ('Unstable db connection, retying again in %s seconds ' + '[%s/%s]' % (self.retry_wait, self.retry_count, + self.max_retries)) + + (_, _, token) = self._parse_target(url) + farmpage = self._download_webpage( + url, token, + note=note, + errnote='Unable to download link farm page', + fatal=False + ) + + if farmpage.strip() != 'MySQL server has gone away': + break + + if self.retry_count < self.max_retries: + self.retry_count += 1 + sleep(self.retry_wait) + else: + msg = 'The moviestorm database is currently unstable. Please try again later.' 
+ raise ExtractorError(msg, expected=True) + + # scrape WATCH button links from moviestorm page + self.to_screen(': Extracting watch page urls') + watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage) + + # get direct urls from scraped watch pages + self.to_screen(': Extracting direct links from watch pages') + for watch_url in watch_urls: + (_, _, token) = self._parse_target(watch_url) + watchpage = self._download_webpage( + watch_url, token, + note=False, + errnote='Unable to download link farm watch page', + fatal=False + ) + + if watchpage is not None: + direct_url = MovieStormHTMLParser.extract_direct_url(watchpage) + if direct_url: + self.direct_urls.append(direct_url) + + self.to_screen(': Passing off farmed links to InfoExtractors') + return list(set(self.direct_urls)) From 4c7a02aa47534d6a7b522bfcb30eb70fadfe8c1c Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 14:42:46 -0400 Subject: [PATCH 02/12] added moviestorm entry to extractor/__init__.py --- youtube_dl/extractor/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ad133603f..4e65570e0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -37,7 +37,6 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE -from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE @@ -232,7 +231,6 @@ from .jove import JoveIE from .jukebox import JukeboxIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .keezmovies import KeezMoviesIE @@ -281,6 +279,7 @@ from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE +from 
.moviestorm import MovieStormIE from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE @@ -559,7 +558,6 @@ from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE -from .viewster import ViewsterIE from .vimeo import ( VimeoIE, VimeoAlbumIE, From 8654a8484f3e22f26fd464811121459e88409f55 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 14:47:06 -0400 Subject: [PATCH 03/12] added process_farmed_links method to YoutubeDL class to handle moviestorm linkfarm IE. This can be used by other linkfarm IEs that I/we plan to write in the future. --- youtube_dl/YoutubeDL.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e779fc9a8..59e8114a4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -629,6 +629,28 @@ class YoutubeDL(object): for key, value in extra_info.items(): info_dict.setdefault(key, value) + def process_farmed_links(self, direct_urls): + familiar_farmed_urls = [] + for farmed_url in direct_urls: + for ie in self._ies: + # not all extractors have IE_NAME set, using class name for fuller coverage + c = ie.__class__.__name__ + + # ignore non-familiar links + if c != 'GenericIE' and c != 'MovieStormIE' and ie.suitable(farmed_url): + familiar_farmed_urls.append( [ie, farmed_url] ) + + for tuple in familiar_farmed_urls: + ie = tuple[0] + familiar_farmed_url = tuple[1] + + try: + ie_result = ie.extract(familiar_farmed_url) + return ie_result, ie + except: + # Failed extract, move on to next url in list + ie.to_screen("\033[0;33mWARNING:\033[0m failed attempt, trying next farmed link") + def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True): ''' @@ -652,6 +674,11 @@ class YoutubeDL(object): try: ie_result = ie.extract(url) + + # handle link farm extractors + if hasattr(ie, '_LINK_FARM') and 
ie._LINK_FARM: + ie_result, ie = self.process_farmed_links(ie_result) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) break if isinstance(ie_result, list): From 0dc602d9e35ab712c3be57e5a0b67ae2a36f2d28 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 14:53:41 -0400 Subject: [PATCH 04/12] replaced tabs with spaces in moviestorm IE --- youtube_dl/extractor/moviestorm.py | 46 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index a4981850a..5fceca6b1 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -56,13 +56,13 @@ class MovieStormIE(InfoExtractor): _VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)' _LINK_FARM = True - # There are no tests for this IE because the links on any given moviestorm - # page can dynamically change, and because the actual download/extraction - # is ultimately preformed by another IE. An example of an acceptable url to - # feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1 + # There are no tests for this IE because the links on any given moviestorm + # page can dynamically change, and because the actual download/extraction + # is ultimately preformed by another IE. 
An example of an acceptable url to + # feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1 _TEST = False - # moviestorm's drupal db config is unstable at times + # moviestorm's drupal db config is unstable at times # retry up to 5 times before giving up, 5 second delay # between each retry retry_count = 0 @@ -79,12 +79,12 @@ class MovieStormIE(InfoExtractor): def _real_extract(self, url): # retry loop to capture moviestorm page while True: - if self.retry_count == 0: + if self.retry_count == 0: note = 'Downloading link farm page' else: - note = ('Unstable db connection, retying again in %s seconds ' - '[%s/%s]' % (self.retry_wait, self.retry_count, - self.max_retries)) + note = ('Unstable db connection, retying again in %s seconds ' + '[%s/%s]' % (self.retry_wait, self.retry_count, + self.max_retries)) (_, _, token) = self._parse_target(url) farmpage = self._download_webpage( @@ -95,14 +95,14 @@ class MovieStormIE(InfoExtractor): ) if farmpage.strip() != 'MySQL server has gone away': - break + break if self.retry_count < self.max_retries: - self.retry_count += 1 - sleep(self.retry_wait) + self.retry_count += 1 + sleep(self.retry_wait) else: - msg = 'The moviestorm database is currently unstable. Please try again later.' - raise ExtractorError(msg, expected=True) + msg = 'The moviestorm database is currently unstable. Please try again later.' 
+ raise ExtractorError(msg, expected=True) # scrape WATCH button links from moviestorm page self.to_screen(': Extracting watch page urls') @@ -111,18 +111,18 @@ class MovieStormIE(InfoExtractor): # get direct urls from scraped watch pages self.to_screen(': Extracting direct links from watch pages') for watch_url in watch_urls: - (_, _, token) = self._parse_target(watch_url) - watchpage = self._download_webpage( - watch_url, token, - note=False, - errnote='Unable to download link farm watch page', - fatal=False + (_, _, token) = self._parse_target(watch_url) + watchpage = self._download_webpage( + watch_url, token, + note=False, + errnote='Unable to download link farm watch page', + fatal=False ) if watchpage is not None: - direct_url = MovieStormHTMLParser.extract_direct_url(watchpage) - if direct_url: - self.direct_urls.append(direct_url) + direct_url = MovieStormHTMLParser.extract_direct_url(watchpage) + if direct_url: + self.direct_urls.append(direct_url) self.to_screen(': Passing off farmed links to InfoExtractors') return list(set(self.direct_urls)) From b0f5c78ffde4cf855e803ebb3ac9cbe131a9fa52 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 14:58:16 -0400 Subject: [PATCH 05/12] replaced tabs with spaces --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/extractor/moviestorm.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 59e8114a4..039bc49fb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -677,7 +677,7 @@ class YoutubeDL(object): # handle link farm extractors if hasattr(ie, '_LINK_FARM') and ie._LINK_FARM: - ie_result, ie = self.process_farmed_links(ie_result) + ie_result, ie = self.process_farmed_links(ie_result) if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) break diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index 
5fceca6b1..5486fa10d 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -30,7 +30,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser): if 'moviestorm' in attrs['href']: self.watch_urls.append(attrs['href'].strip()) elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link': - self.direct_url = attrs['href'].strip() + self.direct_url = attrs['href'].strip() def handle_endtag(self, tag): if tag == 'td': @@ -80,11 +80,11 @@ class MovieStormIE(InfoExtractor): # retry loop to capture moviestorm page while True: if self.retry_count == 0: - note = 'Downloading link farm page' - else: - note = ('Unstable db connection, retying again in %s seconds ' - '[%s/%s]' % (self.retry_wait, self.retry_count, - self.max_retries)) + note = 'Downloading link farm page' + else: + note = ('Unstable db connection, retying again in %s seconds ' + '[%s/%s]' % (self.retry_wait, self.retry_count, + self.max_retries)) (_, _, token) = self._parse_target(url) farmpage = self._download_webpage( From 503a9f3f8e15d08fad4a0f14b63eff9744e90d22 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 15:01:05 -0400 Subject: [PATCH 06/12] replaced tabs with spaces --- youtube_dl/extractor/moviestorm.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index 5486fa10d..db6e66ad2 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -86,23 +86,23 @@ class MovieStormIE(InfoExtractor): '[%s/%s]' % (self.retry_wait, self.retry_count, self.max_retries)) - (_, _, token) = self._parse_target(url) - farmpage = self._download_webpage( - url, token, - note=note, - errnote='Unable to download link farm page', - fatal=False - ) + (_, _, token) = self._parse_target(url) + farmpage = self._download_webpage( + url, token, + note=note, + errnote='Unable to download link farm page', + 
fatal=False + ) - if farmpage.strip() != 'MySQL server has gone away': - break + if farmpage.strip() != 'MySQL server has gone away': + break - if self.retry_count < self.max_retries: - self.retry_count += 1 - sleep(self.retry_wait) - else: - msg = 'The moviestorm database is currently unstable. Please try again later.' - raise ExtractorError(msg, expected=True) + if self.retry_count < self.max_retries: + self.retry_count += 1 + sleep(self.retry_wait) + else: + msg = 'The moviestorm database is currently unstable. Please try again later.' + raise ExtractorError(msg, expected=True) # scrape WATCH button links from moviestorm page self.to_screen(': Extracting watch page urls') From b7a1a296fbda88aec066d84643b74ea245b3db12 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 15:03:49 -0400 Subject: [PATCH 07/12] replaced tabs with spaces --- youtube_dl/extractor/moviestorm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index db6e66ad2..f96957832 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -117,12 +117,12 @@ class MovieStormIE(InfoExtractor): note=False, errnote='Unable to download link farm watch page', fatal=False - ) + ) - if watchpage is not None: - direct_url = MovieStormHTMLParser.extract_direct_url(watchpage) - if direct_url: - self.direct_urls.append(direct_url) + if watchpage is not None: + direct_url = MovieStormHTMLParser.extract_direct_url(watchpage) + if direct_url: + self.direct_urls.append(direct_url) self.to_screen(': Passing off farmed links to InfoExtractors') return list(set(self.direct_urls)) From 3ca77367f9adc97bc88c791b51cae3d83e874d13 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 15:15:53 -0400 Subject: [PATCH 08/12] removed unnecessary imports from extractor/moviestorm.py --- youtube_dl/extractor/moviestorm.py | 4 +--- 1 file changed, 1 insertion(+), 3
deletions(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index f96957832..d6ddd8e82 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -9,9 +9,7 @@ from .common import InfoExtractor from ..utils import ExtractorError from ..compat import ( compat_html_parser, - compat_urllib_parse, - compat_urllib_request, - compat_urlparse, + compat_urlparse ) class MovieStormHTMLParser(compat_html_parser.HTMLParser): From 5af03d5cfecdaf3adf24f3ad4d8660a8b0743278 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 18:34:27 -0400 Subject: [PATCH 09/12] adjusted url handling to pull in all moviestorm urls and provide informative error if not a handleable moviestorm url. This is to prevent youtube-dl from falling back on the generic IE for bad moviestorm urls, as that will always fail --- youtube_dl/extractor/moviestorm.py | 115 ++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index d6ddd8e82..01ab19faf 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -17,6 +17,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser): self.found_button = False self.watch_urls = [] self.direct_url = False + self.series_home_page = False compat_html_parser.HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): @@ -24,7 +25,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser): if tag == 'td' and attrs['class'] == 'link_td': self.found_button = True elif tag == 'a' and self.found_button: - # suppress ishare and other direct links, can't handle now + # Suppress ishare and other direct links, can't handle now if 'moviestorm' in attrs['href']: self.watch_urls.append(attrs['href'].strip()) elif tag == 'a' and 'class' in attrs and attrs['class'] == 'real_link': @@ -34,35 +35,47 @@ class 
MovieStormHTMLParser(compat_html_parser.HTMLParser): if tag == 'td': self.found_button = False - @classmethod - def extract_watch_urls(cls, html): - p = cls() - p.feed(html) - p.close() - return p.watch_urls + def handle_data(self, data): + if data.strip() == 'SHOW EPISODES': + self.series_home_page = True @classmethod - def extract_direct_url(cls, html): + def custom_parse(cls, html, return_variable): p = cls() p.feed(html) p.close() - return p.direct_url + return getattr(p, return_variable) class MovieStormIE(InfoExtractor): - IE_DESC = 'Movie Storm (link farm)' - IE_NAME = 'MovieStorm' - _VALID_URL = r'http://moviestorm\.eu/view/(\d+)-watch-(.*)/season-(\d+)/episode-(\d+)' - _LINK_FARM = True - + # HANDLER INFO: # There are no tests for this IE because the links on any given moviestorm # page can dynamically change, and because the actual download/extraction - # is ultimately preformed by another IE. An example of an acceptable url to - # feed to this IE is: http://moviestorm.eu/view/218-watch-the-simpsons/season-26/episode-1 + # is ultimately preformed by another IE. Example urls to + # feed to this IE are: + # + # EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1 + # MOVIE: http://moviestorm.eu/view/5269-watch-taken-3-online.html + # + # If the user provides a series url, like the one below, this IE should detect + # and raise an error: + # + # SERIES: http://moviestorm.eu/view/5821-watch-portlandia.html + # + # In other news, moviestorm's drupal db config is unstable at times retry up to 5 + # times before giving up, waiting 5 second delay between each retry. + # + # Also, this IE will catch all links with http://moviestorm.eu urls. If it's an + # un-handleable url, an error will be thrown informing the user of appropriate + # urls to provide. Not using a more complex regex is meant to prevent unacceptable + # moviestorm urls from falling back into the generic IE, as that will always fail on + # moviestorm links. 
+ + IE_DESC = 'Movie Storm (link farm)' + IE_NAME = 'MovieStorm' + _VALID_URL = r'http://moviestorm\.eu' + _LINK_FARM = True _TEST = False - # moviestorm's drupal db config is unstable at times - # retry up to 5 times before giving up, 5 second delay - # between each retry retry_count = 0 max_retries = 5 retry_wait = 5 @@ -75,7 +88,12 @@ class MovieStormIE(InfoExtractor): return (uri, hash, token) def _real_extract(self, url): - # retry loop to capture moviestorm page + # Inform user to provide proper moviestorm link + if 'watch' not in url: + msg = ('The moviestorm handler requires either a movie page link or ' + 'a series episode page link. Please try again with one of those.') + raise ExtractorError(msg, expected=True) + while True: if self.retry_count == 0: note = 'Downloading link farm page' @@ -93,8 +111,21 @@ class MovieStormIE(InfoExtractor): ) if farmpage.strip() != 'MySQL server has gone away': + series_home_page = MovieStormHTMLParser.custom_parse( + farmpage, + 'series_home_page' + ) + + # Fail if provided series home page + if series_home_page: + msg = ('It looks like you provided an show page url. You must provide ' + 'an episode page url or movie page url') + raise ExtractorError(msg, expected=True) + + # Success break + # Continue retrying if moviestorm database is currently unstable if self.retry_count < self.max_retries: self.retry_count += 1 sleep(self.retry_wait) @@ -102,25 +133,39 @@ class MovieStormIE(InfoExtractor): msg = 'The moviestorm database is currently unstable. Please try again later.' 
raise ExtractorError(msg, expected=True) - # scrape WATCH button links from moviestorm page + # Scrape WATCH button links from moviestorm page self.to_screen(': Extracting watch page urls') - watch_urls = MovieStormHTMLParser.extract_watch_urls(farmpage) + watch_urls = MovieStormHTMLParser.custom_parse( + farmpage, + 'watch_urls' + ) - # get direct urls from scraped watch pages + # Get direct urls from scraped watch pages self.to_screen(': Extracting direct links from watch pages') - for watch_url in watch_urls: - (_, _, token) = self._parse_target(watch_url) - watchpage = self._download_webpage( - watch_url, token, - note=False, - errnote='Unable to download link farm watch page', - fatal=False - ) + direct_url_count = 1 - if watchpage is not None: - direct_url = MovieStormHTMLParser.extract_direct_url(watchpage) - if direct_url: - self.direct_urls.append(direct_url) + for watch_url in watch_urls: + # Stop after gathering 50 urls, moviestorm sends 503 if + # request too many in rapid succession + if direct_url_count < 50: + (_, _, token) = self._parse_target(watch_url) + watchpage = self._download_webpage( + watch_url, token, + note=False, + errnote='Unable to download link farm watch page', + fatal=False + ) + + if watchpage is not None: + direct_url = MovieStormHTMLParser.custom_parse( + watchpage, + 'direct_url' + ) + + if direct_url: + self.direct_urls.append(direct_url) + + direct_url_count += 1 self.to_screen(': Passing off farmed links to InfoExtractors') return list(set(self.direct_urls)) From 0ede245462471186b8b47be4286c265d8a2304da Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 18:42:30 -0400 Subject: [PATCH 10/12] updated moviestorm extractor to use proper docstrings --- youtube_dl/extractor/moviestorm.py | 45 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index 01ab19faf..de02ab8fd 100644 ---
a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -47,28 +47,29 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser): return getattr(p, return_variable) class MovieStormIE(InfoExtractor): - # HANDLER INFO: - # There are no tests for this IE because the links on any given moviestorm - # page can dynamically change, and because the actual download/extraction - # is ultimately preformed by another IE. Example urls to - # feed to this IE are: - # - # EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1 - # MOVIE: http://moviestorm.eu/view/5269-watch-taken-3-online.html - # - # If the user provides a series url, like the one below, this IE should detect - # and raise an error: - # - # SERIES: http://moviestorm.eu/view/5821-watch-portlandia.html - # - # In other news, moviestorm's drupal db config is unstable at times retry up to 5 - # times before giving up, waiting 5 second delay between each retry. - # - # Also, this IE will catch all links with http://moviestorm.eu urls. If it's an - # un-handleable url, an error will be thrown informing the user of appropriate - # urls to provide. Not using a more complex regex is meant to prevent unacceptable - # moviestorm urls from falling back into the generic IE, as that will always fail on - # moviestorm links. + """EXTRACTOR INFO: + There are no tests for this IE because the links on any given moviestorm + page can dynamically change, and because the actual download/extraction + is ultimately preformed by another IE. 
Example urls to + feed to this IE are: + + EPISODE: http://moviestorm.eu/view/5821-watch-portlandia/season-1/episode-1 + MOVIE: http://moviestorm.eu/view/5269-watch-taken-3-online.html + + If the user provides a series url, like the one below, this IE should detect + and raise an error: + + SERIES: http://moviestorm.eu/view/5821-watch-portlandia.html + + In other news, moviestorm's drupal db config is unstable at times retry up to 5 + times before giving up, waiting 5 second delay between each retry. + + Also, this IE will catch all links with http://moviestorm.eu urls. If it's an + un-handleable url, an error will be thrown informing the user of appropriate + urls to provide. Not using a more complex regex is meant to prevent unacceptable + moviestorm urls from falling back into the generic IE, as that will always fail on + moviestorm links. + """ IE_DESC = 'Movie Storm (link farm)' IE_NAME = 'MovieStorm' From 9fcc22c0f2a50f47cf3bc5684721310570884fdb Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 21:26:11 -0400 Subject: [PATCH 11/12] fixed styling issues that flake8 didn't like --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/extractor/moviestorm.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 039bc49fb..b47b74733 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -638,7 +638,7 @@ class YoutubeDL(object): # ignore non-familiar links if c != 'GenericIE' and c != 'MovieStormIE' and ie.suitable(farmed_url): - familiar_farmed_urls.append( [ie, farmed_url] ) + familiar_farmed_urls.append([ie, farmed_url]) for tuple in familiar_farmed_urls: ie = tuple[0] diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index de02ab8fd..00cce76ed 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import os.path -import re from time 
import sleep from .common import InfoExtractor @@ -12,6 +11,7 @@ from ..compat import ( compat_urlparse ) + class MovieStormHTMLParser(compat_html_parser.HTMLParser): def __init__(self): self.found_button = False @@ -46,6 +46,7 @@ class MovieStormHTMLParser(compat_html_parser.HTMLParser): p.close() return getattr(p, return_variable) + class MovieStormIE(InfoExtractor): """EXTRACTOR INFO: There are no tests for this IE because the links on any given moviestorm @@ -92,7 +93,7 @@ class MovieStormIE(InfoExtractor): # Inform user to provide proper moviestorm link if 'watch' not in url: msg = ('The moviestorm handler requires either a movie page link or ' - 'a series episode page link. Please try again with one of those.') + 'a series episode page link. Please try again with one of those.') raise ExtractorError(msg, expected=True) while True: @@ -100,8 +101,8 @@ class MovieStormIE(InfoExtractor): note = 'Downloading link farm page' else: note = ('Unstable db connection, retying again in %s seconds ' - '[%s/%s]' % (self.retry_wait, self.retry_count, - self.max_retries)) + '[%s/%s]' % (self.retry_wait, self.retry_count, + self.max_retries)) (_, _, token) = self._parse_target(url) farmpage = self._download_webpage( @@ -120,7 +121,7 @@ class MovieStormIE(InfoExtractor): # Fail if provided series home page if series_home_page: msg = ('It looks like you provided an show page url. 
You must provide ' - 'an episode page url or movie page url') + 'an episode page url or movie page url') raise ExtractorError(msg, expected=True) # Success From 8f664d93690b0cc11b342c5f67c3ffc315fafa02 Mon Sep 17 00:00:00 2001 From: Philip Ardery Date: Sat, 14 Mar 2015 22:54:13 -0400 Subject: [PATCH 12/12] added variable for max direct urls scraped --- youtube_dl/extractor/moviestorm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/moviestorm.py b/youtube_dl/extractor/moviestorm.py index 00cce76ed..b492449da 100644 --- a/youtube_dl/extractor/moviestorm.py +++ b/youtube_dl/extractor/moviestorm.py @@ -82,6 +82,7 @@ class MovieStormIE(InfoExtractor): max_retries = 5 retry_wait = 5 direct_urls = [] + direct_url_max = 50 def _parse_target(self, target): uri = compat_urlparse.urlparse(target) @@ -149,7 +150,7 @@ class MovieStormIE(InfoExtractor): for watch_url in watch_urls: # Stop after gathering 50 urls, moviestorm sends 503 if # request too many in rapid succession - if direct_url_count < 50: + if direct_url_count < self.direct_url_max: (_, _, token) = self._parse_target(watch_url) watchpage = self._download_webpage( watch_url, token,