From 18cbaa3f97e734d725836ad9f1604c80ead1b219 Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 10:03:56 +0800 Subject: [PATCH 1/9] bandwidth throttling detection and avoidance --- youtube_dl/__init__.py | 1 + youtube_dl/downloader/http.py | 111 +++++++++++++++++++++++++++++++++- youtube_dl/options.py | 5 ++ 3 files changed, 114 insertions(+), 3 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6850d95e1..03fe3baa2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -325,6 +325,7 @@ def _real_main(argv=None): 'ignoreerrors': opts.ignoreerrors, 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, + 'avoid_throttling': opts.avoid_throttling, 'nooverwrites': opts.nooverwrites, 'retries': opts.retries, 'fragment_retries': opts.fragment_retries, diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index af405b950..7b687ad59 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -18,8 +18,37 @@ from ..utils import ( XAttrUnavailableError, ) - class HttpFD(FileDownloader): + def report_will_throttle(self): + self.report_warning(("\r[download] This website does not support Content-Range header, " + "bandwidth throttling, if present, will not be avoided.")) + + def speed_up(self, data, request, peak_rate, block_rate, byte_counter, threshold): + # If current block rate is less than threshold, make a new request with new range + # header. Simply chaning the range header of an already existing request and + # does not always work and may start giving + # HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop. + # errors after serveral reconnections on some websites (e.g. vk.com is fine with reusing the same + # request, but pornhub.com is not) + if block_rate < peak_rate * threshold: + if self.params.get('verbose', False): + self.to_screen(("\n[throttling] Bandwidth throttling detected, making a new request. " + "(block rate = %.3f, peak rate = %.3f, threshold = %.2f") % (block_rate, peak_rate, threshold)) + request = sanitized_Request(request.full_url, None, request.headers) + request.add_header('Range', 'bytes=%d-' % byte_counter) + try: + new_data = self.ydl.urlopen(request) + except Exception as e: + self.report_warning("\r[download] Error when making a new request to avoid throttling, keeping previous connection and disabling this feature.") + self.report_warning("\r[download] %s" % e) + self.avoid_throttling = False + new_data = data + else: + data.close() # just to be safe + else: + new_data = data + return new_data + def real_download(self, filename, info_dict): url = info_dict['url'] tmpfilename = self.temp_name(filename) @@ -32,6 +61,8 @@ class HttpFD(FileDownloader): headers.update(add_headers) basic_request = sanitized_Request(url, None, headers) request = sanitized_Request(url, None, headers) + range_request = sanitized_Request(url, None, headers) + range_request.add_header('Range', 'bytes=10-20') is_test = self.params.get('test', False) @@ -55,7 +86,31 @@ class HttpFD(FileDownloader): count = 0 retries = self.params.get('retries', 0) + self.avoid_throttling = self.params.get('avoid_throttling', False) while count <= retries: + # Verify Content-Range header is accepted and honored. + if self.avoid_throttling: + try: + data = self.ydl.urlopen(range_request) + content_range = data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-', content_range) + test_range = re.search(r'bytes=(\d+)-', range_request.get_header('Range')) + if not content_range_m or test_range.group(1) != content_range_m.group(1): + self.avoid_throttling = False + except(compat_urllib_error.HTTPError, ) as err: + if err.code == 416: + self.avoid_throttling = False + elif (err.code < 500 or err.code >= 600): + # Unexpected HTTP error + raise + if not self.avoid_throttling: + self.report_will_throttle() + if resume_len > 0: + self.report_unable_to_resume() + resume_len = 0 + open_mode = 'wb' + # Establish connection try: data = self.ydl.urlopen(request) @@ -64,7 +119,8 @@ class HttpFD(FileDownloader): # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if resume_len > 0: + # This check is only done if throttling avoidance has not been requested. + if resume_len > 0 and not self.avoid_throttling: content_range = data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) @@ -154,14 +210,20 @@ class HttpFD(FileDownloader): byte_counter = 0 + resume_len block_size = self.params.get('buffersize', 1024) + # 4Mb is too much in case of bandwith throttling (takes ages to detect) + block_size_limit = 512 * 1024 start = time.time() # measure time over whole while-loop, so slow_down() and best_block_size() work together properly now = None # needed for slow_down() in the first loop run before = start # start measuring + peak_rate = 0 + throttling_start = None + throttling_threshold = None + throttling_size = 0 while True: - # Download and write + block_start = time.time() data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) byte_counter += len(data_block) @@ -169,6 +231,8 @@ class HttpFD(FileDownloader): if len(data_block) == 0: break + block_rate = block_size / (time.time() - block_start) + # Open destination file just in time if stream is None: try: @@ -203,6 +267,8 @@ class HttpFD(FileDownloader): # Adjust block size if not self.params.get('noresizebuffer', False): block_size = self.best_block_size(after - before, len(data_block)) + if self.avoid_throttling: + block_size = min(block_size, block_size_limit) before = after @@ -212,6 +278,45 @@ class HttpFD(FileDownloader): eta = None else: eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + + if speed and speed > peak_rate and time.time() - start > 1: + peak_rate = speed + + # Initial throttling detection mechanism. + # After data rate has dropped significantly starts calculating new + # rate and after a few seconds determines the restart + # threshold and max block size to catch subsequent throttles in a reasonable + # amount of time (around a second) + # threshold is set to twice the throttled data rate + # max block size is set to the power of two closest to the throttled data rate + if self.avoid_throttling and not throttling_threshold and peak_rate and block_rate <= peak_rate * 0.7: + throttling_size += block_size + if self.params.get('verbose', False): + self.to_screen(("\n[throttling] Throttling started or is continuing, block rate = %.3f, " + "peak rate = %.3f") % (block_rate, peak_rate)) + if not throttling_start: + throttling_start = block_start + if time.time() - throttling_start >= 3: + throttling_rate = throttling_size / (time.time() - throttling_start) + if throttling_rate > peak_rate * 0.7: + if self.params.get('verbose', False): + self.to_screen(("[throttling] Wasn't a throttle, temporary network hiccup " + "(current rate = %.3f, peak rate = %.3f.") % (throttling_rate, peak_rate)) + throttling_start = None + throttling_size = 0 + power = 0 + while int(throttling_rate + throttling_rate / 2) >> power != 1: + power += 1 + block_size_limit = 1 << power + throttling_threshold = min(5 * throttling_rate / peak_rate, 0.5) + if self.params.get('verbose', False): + self.to_screen(("[throttling] Throttling detected! peak rate = %.3f, current rate = %.3f, " + "setting threshold to %.2f and block size limit to %dKb") % (peak_rate, + throttling_rate, throttling_threshold, block_size_limit / 1024)) + + # We need max speed! + if self.avoid_throttling and throttling_threshold and byte_counter != data_len: + data = self.speed_up(data, request, peak_rate, block_rate, byte_counter, throttling_threshold) self._hook_progress({ 'status': 'downloading', diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 53497fbc6..04f2f6615 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -434,6 +434,11 @@ def parseOpts(overrideArguments=None): '-r', '--limit-rate', '--rate-limit', dest='ratelimit', metavar='RATE', help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)') + downloader.add_option( + '--avoid-throttling', + action="store_true", dest='avoid_throttling', + help='Make a new request when bandwidth throttling is detected. Content-Range header must be supported', + default=False) downloader.add_option( '-R', '--retries', dest='retries', metavar='RETRIES', default=10, From 36d9f5f04b4f411727b9020e46d917806ce9ed99 Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 14:09:51 +0800 Subject: [PATCH 2/9] threshold strategy changed, range verification enhanced, debug lines improved --- youtube_dl/downloader/http.py | 44 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 7b687ad59..649981ea0 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -32,10 +32,13 @@ class HttpFD(FileDownloader): # request, but pornhub.com is not) if block_rate < peak_rate * threshold: if self.params.get('verbose', False): + last_range = request.headers.get('Range') + last_range_start = last_range and int(re.search(r'bytes=(\d+)-', last_range).group(1)) or 0 self.to_screen(("\n[throttling] Bandwidth throttling detected, making a new request. " - "(block rate = %.3f, peak rate = %.3f, threshold = %.2f") % (block_rate, peak_rate, threshold)) - request = sanitized_Request(request.full_url, None, request.headers) + "(block rate = %.2fKiB/s, downloaded %.0fKiB before throttling)") % ( + block_rate / 1024, (byte_counter - last_range_start) / 1024)) request.add_header('Range', 'bytes=%d-' % byte_counter) + request = sanitized_Request(request.full_url, None, request.headers) try: new_data = self.ydl.urlopen(request) except Exception as e: @@ -95,8 +98,11 @@ class HttpFD(FileDownloader): content_range = data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) - test_range = re.search(r'bytes=(\d+)-', range_request.get_header('Range')) - if not content_range_m or test_range.group(1) != content_range_m.group(1): + test_range_m = re.search(r'bytes=(\d+)-(\d+)', range_request.get_header('Range')) + test_length = str(int(test_range_m.group(2)) - int(test_range_m.group(1)) + 1) + content_length = data.info()['Content-Length'] + if (not content_range_m or test_range_m.group(1) != content_range_m.group(1) + or test_length != content_length): self.avoid_throttling = False except(compat_urllib_error.HTTPError, ) as err: if err.code == 416: @@ -226,13 +232,12 @@ class HttpFD(FileDownloader): block_start = time.time() data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) byte_counter += len(data_block) + block_rate = block_size / (time.time() - block_start) # exit loop when download is finished if len(data_block) == 0: break - block_rate = block_size / (time.time() - block_start) - # Open destination file just in time if stream is None: try: @@ -292,10 +297,10 @@ class HttpFD(FileDownloader): if self.avoid_throttling and not throttling_threshold and peak_rate and block_rate <= peak_rate * 0.7: throttling_size += block_size if self.params.get('verbose', False): - self.to_screen(("\n[throttling] Throttling started or is continuing, block rate = %.3f, " - "peak rate = %.3f") % (block_rate, peak_rate)) + self.to_screen(("\n[throttling] Throttling started or is continuing, block rate = %.2fKiB/s, " + "peak rate = %.2fKiB/s") % (block_rate / 1024, peak_rate / 1024)) if not throttling_start: - throttling_start = block_start + throttling_start = block_start if time.time() - throttling_start >= 3: throttling_rate = throttling_size / (time.time() - throttling_start) if throttling_rate > peak_rate * 0.7: @@ -304,18 +309,19 @@ class HttpFD(FileDownloader): "(current rate = %.3f, peak rate = %.3f.") % (throttling_rate, peak_rate)) throttling_start = None throttling_size = 0 - power = 0 - while int(throttling_rate + throttling_rate / 2) >> power != 1: - power += 1 - block_size_limit = 1 << power - throttling_threshold = min(5 * throttling_rate / peak_rate, 0.5) - if self.params.get('verbose', False): - self.to_screen(("[throttling] Throttling detected! peak rate = %.3f, current rate = %.3f, " - "setting threshold to %.2f and block size limit to %dKb") % (peak_rate, - throttling_rate, throttling_threshold, block_size_limit / 1024)) + else: + block_size_limit = 1 + while block_size_limit < int(throttling_rate / 1.5): + block_size_limit *= 2 + throttling_threshold = (throttling_rate + (peak_rate - throttling_rate) / 4) / peak_rate + throttling_threshold = min(throttling_threshold, 0.7) + if self.params.get('verbose', False): + self.to_screen(("[throttling] Throttling detected! peak rate = %.2fKiB/s, current rate = %.2fKiB/s, " + "setting threshold to %.2f and block size limit to %dKiB") % (peak_rate / 1024, + throttling_rate / 1024, throttling_threshold, block_size_limit / 1024), True) # We need max speed! - if self.avoid_throttling and throttling_threshold and byte_counter != data_len: + if self.avoid_throttling and throttling_threshold and peak_rate and byte_counter != data_len: data = self.speed_up(data, request, peak_rate, block_rate, byte_counter, throttling_threshold) self._hook_progress({ From 82960baa8eb744573529f328fb5cd888f375022c Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 15:32:37 +0800 Subject: [PATCH 3/9] false positive rejection fix --- youtube_dl/downloader/http.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 649981ea0..d6096b792 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -35,8 +35,8 @@ class HttpFD(FileDownloader): last_range = request.headers.get('Range') last_range_start = last_range and int(re.search(r'bytes=(\d+)-', last_range).group(1)) or 0 self.to_screen(("\n[throttling] Bandwidth throttling detected, making a new request. " - "(block rate = %.2fKiB/s, downloaded %.0fKiB before throttling)") % ( - block_rate / 1024, (byte_counter - last_range_start) / 1024)) + "(peak rate = %.2fKiB/s, block rate = %.2fKiB/s, downloaded %.0fKiB before throttling)") % ( + peak_rate / 1024, block_rate / 1024, (byte_counter - last_range_start) / 1024)) request.add_header('Range', 'bytes=%d-' % byte_counter) request = sanitized_Request(request.full_url, None, request.headers) try: @@ -226,7 +226,7 @@ class HttpFD(FileDownloader): peak_rate = 0 throttling_start = None throttling_threshold = None - throttling_size = 0 + throttling_start_size = 0 while True: # Download and write block_start = time.time() @@ -284,7 +284,7 @@ class HttpFD(FileDownloader): else: eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) - if speed and speed > peak_rate and time.time() - start > 1: + if self.avoid_throttling and speed and speed > peak_rate and time.time() - start > 1: peak_rate = speed # Initial throttling detection mechanism. @@ -295,20 +295,20 @@ class HttpFD(FileDownloader): # threshold is set to twice the throttled data rate # max block size is set to the power of two closest to the throttled data rate if self.avoid_throttling and not throttling_threshold and peak_rate and block_rate <= peak_rate * 0.7: - throttling_size += block_size if self.params.get('verbose', False): self.to_screen(("\n[throttling] Throttling started or is continuing, block rate = %.2fKiB/s, " "peak rate = %.2fKiB/s") % (block_rate / 1024, peak_rate / 1024)) if not throttling_start: throttling_start = block_start + throttling_start_size = byte_counter - block_size if time.time() - throttling_start >= 3: - throttling_rate = throttling_size / (time.time() - throttling_start) + throttling_rate = (byte_counter - throttling_start_size) / (time.time() - throttling_start) if throttling_rate > peak_rate * 0.7: if self.params.get('verbose', False): self.to_screen(("[throttling] Wasn't a throttle, temporary network hiccup " "(current rate = %.3f, peak rate = %.3f.") % (throttling_rate, peak_rate)) throttling_start = None - throttling_size = 0 + throttling_start_size = 0 else: block_size_limit = 1 while block_size_limit < int(throttling_rate / 1.5): From 68c3ec55970a5b85bd6c901e81299ecacf43c15a Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 15:34:22 +0800 Subject: [PATCH 4/9] extra line --- youtube_dl/downloader/http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index d6096b792..4a4790505 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -18,6 +18,7 @@ from ..utils import ( XAttrUnavailableError, ) + class HttpFD(FileDownloader): def report_will_throttle(self): self.report_warning(("\r[download] This website does not support Content-Range header, " From d4c045a5cab24a3966764bd477744e088d35651e Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 19:21:04 +0800 Subject: [PATCH 5/9] block_size restriction on peak rate --- youtube_dl/downloader/http.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 4a4790505..ca41bc913 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -285,7 +285,7 @@ class HttpFD(FileDownloader): else: eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) - if self.avoid_throttling and speed and speed > peak_rate and time.time() - start > 1: + if self.avoid_throttling and speed and speed > peak_rate and time.time() - start > 1 and block_size >= 65536: peak_rate = speed # Initial throttling detection mechanism. @@ -307,7 +307,8 @@ class HttpFD(FileDownloader): if throttling_rate > peak_rate * 0.7: if self.params.get('verbose', False): self.to_screen(("[throttling] Wasn't a throttle, temporary network hiccup " - "(current rate = %.3f, peak rate = %.3f.") % (throttling_rate, peak_rate)) + "(current rate = %.2fKiB/s, peak rate = %.2fKiB/s.") % ( + throttling_rate / 1024, peak_rate / 1024)) throttling_start = None throttling_start_size = 0 else: From d9c022120482ba561eb5cf9c2df2cc1cdb47cbe3 Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 20:48:57 +0800 Subject: [PATCH 6/9] fix division by zero for block_rate calculation --- youtube_dl/downloader/http.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index ca41bc913..abef8ad66 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -233,7 +233,11 @@ class HttpFD(FileDownloader): block_start = time.time() data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) byte_counter += len(data_block) - block_rate = block_size / (time.time() - block_start) + block_time = time.time() - block_start + if block_time != 0: + block_rate = block_size / block_time + else: + block_rate = float('+inf') # exit loop when download is finished if len(data_block) == 0: From 7e1ac8ed0aea2bd621e449bc6f68b4ad48dd1a64 Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 21:05:25 +0800 Subject: [PATCH 7/9] guard time after throttle --- youtube_dl/downloader/http.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index abef8ad66..5ac132773 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -31,6 +31,7 @@ class HttpFD(FileDownloader): # HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop. # errors after serveral reconnections on some websites (e.g. vk.com is fine with reusing the same # request, but pornhub.com is not) + throttled = False if block_rate < peak_rate * threshold: if self.params.get('verbose', False): last_range = request.headers.get('Range') @@ -42,6 +43,7 @@ class HttpFD(FileDownloader): request = sanitized_Request(request.full_url, None, request.headers) try: new_data = self.ydl.urlopen(request) + throttled = True except Exception as e: self.report_warning("\r[download] Error when making a new request to avoid throttling, keeping previous connection and disabling this feature.") self.report_warning("\r[download] %s" % e) @@ -51,7 +53,7 @@ class HttpFD(FileDownloader): data.close() # just to be safe else: new_data = data - return new_data + return new_data, throttled def real_download(self, filename, info_dict): url = info_dict['url'] @@ -225,7 +227,7 @@ class HttpFD(FileDownloader): now = None # needed for slow_down() in the first loop run before = start # start measuring peak_rate = 0 - throttling_start = None + throttling_start = time.time() throttling_threshold = None throttling_start_size = 0 while True: @@ -327,8 +329,11 @@ class HttpFD(FileDownloader): throttling_rate / 1024, throttling_threshold, block_size_limit / 1024), True) # We need max speed! - if self.avoid_throttling and throttling_threshold and peak_rate and byte_counter != data_len: - data = self.speed_up(data, request, peak_rate, block_rate, byte_counter, throttling_threshold) + if (self.avoid_throttling and throttling_threshold and peak_rate and + byte_counter != data_len and time.time() - throttling_start > 1): + data, throttled = self.speed_up(data, request, peak_rate, block_rate, byte_counter, throttling_threshold) + if throttled: + throttling_start = block_start self._hook_progress({ 'status': 'downloading', From d324742a17598f1b6de5db63b6858ef62355dfce Mon Sep 17 00:00:00 2001 From: arichi Date: Sun, 18 Dec 2016 21:08:54 +0800 Subject: [PATCH 8/9] throttling_start should start with None --- youtube_dl/downloader/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 5ac132773..f9cd23a2d 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -227,7 +227,7 @@ class HttpFD(FileDownloader): now = None # needed for slow_down() in the first loop run before = start # start measuring peak_rate = 0 - throttling_start = time.time() + throttling_start = None throttling_threshold = None throttling_start_size = 0 while True: From 50e8b765a1199c00817483bffefbf5e7c8f1bc53 Mon Sep 17 00:00:00 2001 From: Sergey Korabanov Date: Mon, 2 Jan 2017 21:38:52 +0800 Subject: [PATCH 9/9] fix Request.full_url incompatibility with python2 --- youtube_dl/downloader/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index f9cd23a2d..fad3b3ed9 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -40,7 +40,7 @@ class HttpFD(FileDownloader): "(peak rate = %.2fKiB/s, block rate = %.2fKiB/s, downloaded %.0fKiB before throttling)") % ( peak_rate / 1024, block_rate / 1024, (byte_counter - last_range_start) / 1024)) request.add_header('Range', 'bytes=%d-' % byte_counter) - request = sanitized_Request(request.full_url, None, request.headers) + request = sanitized_Request(request.get_full_url(), None, request.headers) try: new_data = self.ydl.urlopen(request) throttled = True