From 1e77c43688fa2d4c3a8c1889c9ab22a3eb1c96fb Mon Sep 17 00:00:00 2001 From: bato3 Date: Tue, 31 Jul 2018 12:53:39 +0200 Subject: [PATCH] Login when is present cloudflare challenge --- youtube_dl/extractor/common.py | 58 +++++++++++++++++++++++++++++ youtube_dl/extractor/crunchyroll.py | 41 ++++++-------------- youtube_dl/extractor/openload.py | 4 +- 3 files changed, 71 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b8bbaf81a..61ca7275c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2818,6 +2818,64 @@ class InfoExtractor(object): def _generic_title(self, url): return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + def _cf_solve_challenge(self, body, domain): + ''' + Solve the Cloudflare challenge. + @param domain result of `compat_urlparse.urlparse().netloc` + Original code from: https://github.com/Anorov/cloudflare-scrape/blob/master/cfscrape/__init__.py#L112-L149 + ''' + try: + js = re.search(r"setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1) + except Exception: + raise ValueError("Unable to identify Cloudflare IUAM Javascript on website.") + + js = re.sub(r"a\.value = (.+ \+ t\.length).+", r"\1", js) + js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain))) + + # Strip characters that could be used to exit the string context + # These characters are not currently used in Cloudflare's arithmetic snippet + js = re.sub(r"[\n\\']", "", js) + + if "toFixed" not in js: + raise ValueError("Error parsing Cloudflare IUAM Javascript challenge.") + + # Use vm.runInNewContext to safely evaluate code + # The sandboxed code cannot use the Node.js standard library + js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js + + import subprocess + try: + result = subprocess.check_output(["node", "-e", js]).strip() + except OSError 
as e: + if e.errno == 2: + raise EnvironmentError("Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.") + raise + except Exception: + self.to_screen("Error executing Cloudflare IUAM Javascript.") + raise + + try: + float(result) + except Exception: + raise ValueError("Cloudflare IUAM challenge returned unexpected answer.") + + return result + + def cf_solve_and_download_webpage(self, html, download_url): + if '/cdn-cgi/l/chk_jschl' not in html: + return False + parsed_url = compat_urlparse.urlparse(download_url) + domain = parsed_url.netloc + submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain) + form_data = self._form_hidden_inputs('challenge-form', html) + form_data['jschl_answer'] = self._cf_solve_challenge(html, domain) + + self._sleep(5, None, 'Solving Cloudflare challenge (5s)') + return self._download_webpage( + submit_url, + None, 'Sending Cloudflare challenge', 'Wrong Cloudflare challenge', query=form_data + ) + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index fc1210079..51339abb7 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -36,6 +36,7 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.crunchyroll.com/login' _LOGIN_FORM = 'login_form' + _PROFILE_URL = 'https://www.crunchyroll.com/acct/membership' _NETRC_MACHINE = 'crunchyroll' def _call_rpc_api(self, method, video_id, note=None, data=None): @@ -52,25 +53,17 @@ class CrunchyrollBaseIE(InfoExtractor): username, password = self._get_login_info() if username is None: return - ''' - import cfscrape - proxies = 
{"http": self._downloader.params.get('proxy'), "https": self._downloader.params.get('proxy')} - tokens, user_agent = cfscrape.get_tokens(self._LOGIN_URL, proxies=proxies, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0") - - self._set_cookie( '.crunchyroll.com', 'cf_clearance',tokens['cf_clearance']) - self._set_cookie( '.crunchyroll.com', '__cfduid',tokens['__cfduid']) - ''' login_page = self._download_webpage( 'https://www.crunchyroll.com/?a=formhandler', None, 'Logging in', 'Wrong login info', data=urlencode_postdata({ 'formname': 'RpcApiUser_Login', - 'next_url': 'https://www.crunchyroll.com/acct/membership', - 'fail_url': self._LOGIN_URL, + 'next_url': self._PROFILE_URL, + 'fail_url': self._PROFILE_URL, # On login fail redirect to login page 'name': username, 'password': password, - }), expected_status=503) + }), expected_status=503) # 503 for CloudFlare def is_logged(webpage): return 'Redirecting' in webpage or '/logout' in webpage @@ -79,23 +72,13 @@ class CrunchyrollBaseIE(InfoExtractor): if is_logged(login_page): return - - ''' - print [tokens, user_agent] - - - form_data = self._form_hidden_inputs('challenge-form', login_page) - form_data['jschl_answer'] = self.solve_challenge(login_page, 'www.crunchyroll.com') - print form_data - self._sleep(6, None, 'Solving CloudFlare Challenge') - login_page = self._download_webpage('https://www.crunchyroll.com/cdn-cgi/l/chk_jschl', None, 'Login Form', data=urlencode_postdata(form_data), headers={ - 'Referer': self._LOGIN_URL, - }, expected_status= 503) - - import codecs - with codecs.open("yop", "w", encoding="utf-8") as f: - f.write(login_page) - ''' + cf_page = self.cf_solve_and_download_webpage(login_page, self._LOGIN_URL) + if cf_page: + login_page = cf_page + if is_logged(cf_page): + login_page = self._download_webpage(self._PROFILE_URL, None, 'Get new CSRF Token') + if is_logged(login_page): + return login_form_str = self._search_regex( 
r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, @@ -130,7 +113,6 @@ class CrunchyrollBaseIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): self._login() @@ -146,7 +128,6 @@ class CrunchyrollBaseIE(InfoExtractor): # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. request.add_header('Accept-Language', '*') - request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0') return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) @staticmethod diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d264fe206..58360f3da 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -164,7 +164,7 @@ class PhantomJSwrapper(object): cookie['expire_time'] = cookie['expiry'] self.extractor._set_cookie(**compat_kwargs(cookie)) - def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();', expected_status=None): """ Downloads webpage (if needed) and executes JS @@ -203,7 +203,7 @@ class PhantomJSwrapper(object): if 'saveAndExit();' not in jscode: raise ExtractorError('`saveAndExit();` not found in `jscode`') if not html: - html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) + html = self.extractor._download_webpage(url, video_id, note=note, headers=headers, expected_status=expected_status) with open(self._TMP_FILES['html'].name, 'wb') as f: f.write(html.encode('utf-8'))