From ba2623208cb0694f7e9cd6dc1c37b8ee53b3cb3c Mon Sep 17 00:00:00 2001 From: bato3 Date: Mon, 1 Apr 2019 17:38:53 +0200 Subject: [PATCH 1/3] Detect the `cloudflare challenge` and if `cfscrape` is available, try to solve it. --- youtube_dl/extractor/common.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0889288f0..e51ad71a5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -52,6 +52,7 @@ from ..utils import ( float_or_none, GeoRestrictedError, GeoUtils, + HEADRequest, int_or_none, js_to_json, JSON_LD_RE, @@ -66,6 +67,7 @@ from ..utils import ( RegexNotFoundError, sanitized_Request, sanitize_filename, + std_headers, str_or_none, unescapeHTML, unified_strdate, @@ -79,6 +81,11 @@ from ..utils import ( xpath_text, xpath_with_ns, ) +try: + import cfscrape + cfscrape_available = True +except ImportError: + cfscrape_available = False class InfoExtractor(object): @@ -625,6 +632,26 @@ class InfoExtractor(object): try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if isinstance(err, compat_urllib_error.HTTPError) and not isinstance(url_or_request, HEADRequest): + if err.code == 503 and err.headers.get('Server').startswith('cloudflare'): + if not cfscrape_available: + raise ExtractorError('Cloudflare challenge found. 
Provide cookies or install cfscrape.', expected=True) + else: + self.to_screen('Solving Cloudflare challenge (~7s)') + scraper = cfscrape.create_scraper() + cookies = dict((cookie.name, cookie.value) for cookie in self._downloader.cookiejar) + try: + tokens = scraper.get_tokens(err.geturl(), std_headers['User-Agent'], cookies=cookies) + except ValueError as e: + raise ExtractorError('cfscrape error: %s' % e, expected=True) + cookie = url_or_request.get_header('Cookie') + cookie += '; cf_clearance=' + tokens[0]['cf_clearance'] + url_or_request = update_Request(url_or_request, headers={'Cookie': cookie}) + self.to_screen('Redownload webpage') + try: + return self._downloader.urlopen(url_or_request) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as new_err: + err = new_err if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): # Retain reference to error to prevent file object from From af778e165f7db629f2d6014f4c319729493220a6 Mon Sep 17 00:00:00 2001 From: bato3 Date: Mon, 1 Apr 2019 23:45:05 +0200 Subject: [PATCH 2/3] when url_or_request isn't a Request object. Cloudscrape has problems with URLs such as `http://example.com//double-slash` --- youtube_dl/extractor/cda.py | 2 +- youtube_dl/extractor/common.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 0c3af23d5..22bc48eb8 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -18,7 +18,7 @@ from ..utils import ( class CDAIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' - _BASE_URL = 'http://www.cda.pl/' + _BASE_URL = 'https://www.cda.pl' _TESTS = [{ 'url': 'http://www.cda.pl/video/5749950c', 'md5': '6f844bf51b15f31fae165365707ae970', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e51ad71a5..392cd4873 100644 --- 
a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -644,9 +644,15 @@ class InfoExtractor(object): tokens = scraper.get_tokens(err.geturl(), std_headers['User-Agent'], cookies=cookies) except ValueError as e: raise ExtractorError('cfscrape error: %s' % e, expected=True) - cookie = url_or_request.get_header('Cookie') - cookie += '; cf_clearance=' + tokens[0]['cf_clearance'] - url_or_request = update_Request(url_or_request, headers={'Cookie': cookie}) + + cookie = 'cf_clearance=' + tokens[0]['cf_clearance'] + for c in self._downloader.cookiejar: + cookie += '; %s=%s' % (c.name, c.value) + if not isinstance(url_or_request, compat_urllib_request.Request): + self._set_cookie(compat_urlparse.urlparse(err.geturl()).netloc, 'cf_clearance', tokens[0]['cf_clearance']) + url_or_request = sanitized_Request(url_or_request, data, {'Cookie': cookie}) + else: + url_or_request = update_Request(url_or_request, headers={'Cookie': cookie}) self.to_screen('Redownload webpage') try: return self._downloader.urlopen(url_or_request) From 9880b2da45a1c3daf8881626b00536636005d9bc Mon Sep 17 00:00:00 2001 From: bato3 Date: Tue, 2 Apr 2019 01:02:24 +0200 Subject: [PATCH 3/3] `__cfduid` can't be stored too long in cookieJar --- youtube_dl/extractor/common.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 392cd4873..9535078f1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -639,17 +639,19 @@ class InfoExtractor(object): else: self.to_screen('Solving Cloudflare challenge (~7s)') scraper = cfscrape.create_scraper() - cookies = dict((cookie.name, cookie.value) for cookie in self._downloader.cookiejar) try: - tokens = scraper.get_tokens(err.geturl(), std_headers['User-Agent'], cookies=cookies) + tokens = scraper.get_tokens(err.geturl(), std_headers['User-Agent']) except ValueError as e: raise ExtractorError('cfscrape error: %s' % e, 
expected=True) - cookie = 'cf_clearance=' + tokens[0]['cf_clearance'] + cookie = 'cf_clearance=' + tokens[0]['cf_clearance'] + '; __cfduid=' + tokens[0]['__cfduid'] for c in self._downloader.cookiejar: - cookie += '; %s=%s' % (c.name, c.value) + if c.name != '__cfduid' and c.name != 'cf_clearance': + cookie += '; %s=%s' % (c.name, c.value) + domain = '.' + compat_urlparse.urlparse(err.geturl()).netloc.replace('www.', '') + self._set_cookie(domain, 'cf_clearance', tokens[0]['cf_clearance']) + self._set_cookie(domain, '__cfduid', tokens[0]['__cfduid']) if not isinstance(url_or_request, compat_urllib_request.Request): - self._set_cookie(compat_urlparse.urlparse(err.geturl()).netloc, 'cf_clearance', tokens[0]['cf_clearance']) url_or_request = sanitized_Request(url_or_request, data, {'Cookie': cookie}) else: url_or_request = update_Request(url_or_request, headers={'Cookie': cookie})