diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c5eff009c..12e7f02ce 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,12 +6,13 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.27*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.27** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.02.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.02.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones +- [ ] Checked that provided video/audio/playlist URLs (if any) are alive and playable in a browser ### What is the purpose of your *issue*? - [ ] Bug report (encountered problems with youtube-dl) @@ -35,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.01.27 +[debug] youtube-dl version 2018.02.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md index 26f61d3b4..8edbd5a0f 100644 --- a/.github/ISSUE_TEMPLATE_tmpl.md +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -12,6 +12,7 @@ ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones +- [ ] Checked that provided video/audio/playlist URLs (if any) are alive and playable in a browser ### What is the purpose of your *issue*? - [ ] Bug report (encountered problems with youtube-dl) diff --git a/ChangeLog b/ChangeLog index 00c5c9c6b..db212c4a1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version 2018.02.03 + +Core ++ Introduce --http-chunk-size for chunk-based HTTP downloading ++ Add support for IronPython +* [downloader/ism] Fix Python 3.2 support + +Extractors +* [redbulltv] Fix extraction (#15481) +* [redtube] Fix metadata extraction (#15472) +* [pladform] Respect platform id and extract HLS formats (#15468) +- [rtlnl] Remove progressive formats (#15459) +* [6play] Do no modify asset URLs with a token (#15248) +* [nationalgeographic] Relax URL regular expression +* [dplay] Relax URL regular expression (#15458) +* [cbsinteractive] Fix data extraction (#15451) ++ [amcnetworks] Add support for sundancetv.com (#9260) + + version 2018.01.27 Core diff --git a/README.md b/README.md index 7787a3f17..f8fc27823 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo size. By default, the buffer size is automatically resized from an initial value of SIZE. + --http-chunk-size SIZE Size of a chunk for chunk-based HTTP + downloading (e.g. 10485760 or 10M) (default + is disabled). May be useful for bypassing + bandwidth throttling imposed by a webserver + (experimental) --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with diff --git a/setup.cfg b/setup.cfg index 2dc06ffe4..5208f7ae2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,4 +3,4 @@ universal = True [flake8] exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git -ignore = E402,E501,E731 +ignore = E402,E501,E731,E741 diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py new file mode 100644 index 000000000..5cf2bf1a5 --- /dev/null +++ b/test/test_downloader_http.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import re +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import try_rm +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server +from youtube_dl.downloader.http import HttpFD +from youtube_dl.utils import encodeFilename +import ssl +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] + + +TEST_SIZE = 10 * 1024 + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def send_content_range(self, total=None): + range_header = self.headers.get('Range') + start = end = None + if range_header: + mobj = re.search(r'^bytes=(\d+)-(\d+)', range_header) + if mobj: + start = int(mobj.group(1)) + end = int(mobj.group(2)) + valid_range = start is not None and end is not None + if valid_range: + content_range = 'bytes %d-%d' % (start, end) + if total: + content_range += '/%d' % total + self.send_header('Content-Range', content_range) + return (end - start + 1) if valid_range else total + + def serve(self, range=True, content_length=True): + self.send_response(200) + self.send_header('Content-Type', 'video/mp4') + size = TEST_SIZE + if range: + size = self.send_content_range(TEST_SIZE) + if content_length: + self.send_header('Content-Length', size) + self.end_headers() + self.wfile.write(b'#' * size) + + def do_GET(self): + if self.path == '/regular': + self.serve() + elif self.path == '/no-content-length': + self.serve(content_length=False) + elif self.path == '/no-range': + self.serve(range=False) + elif self.path == '/no-range-no-content-length': + self.serve(range=False, content_length=False) + else: + assert False + + +class FakeLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + pass + + +class TestHttpFD(unittest.TestCase): + def setUp(self): + self.httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + self.port = http_server_port(self.httpd) + self.server_thread = threading.Thread(target=self.httpd.serve_forever) + self.server_thread.daemon = True + self.server_thread.start() + + def download(self, params, ep): + params['logger'] = FakeLogger() + ydl = YoutubeDL(params) + downloader = HttpFD(ydl, params) + filename = 'testfile.mp4' + try_rm(encodeFilename(filename)) + self.assertTrue(downloader.real_download(filename, { + 'url': 'http://127.0.0.1:%d/%s' % (self.port, ep), + })) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + try_rm(encodeFilename(filename)) + + def download_all(self, params): + for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): + self.download(params, ep) + + def test_regular(self): + self.download_all({}) + + def test_chunked(self): + self.download_all({ + 'http_chunk_size': 1000, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_http.py b/test/test_http.py index 7a7a3510f..409fec9c8 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -47,7 +47,7 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): self.end_headers() return - new_url = 'http://localhost:%d/中文.html' % http_server_port(self.server) + new_url = 'http://127.0.0.1:%d/中文.html' % http_server_port(self.server) self.send_response(302) self.send_header(b'Location', new_url.encode('utf-8')) self.end_headers() @@ -74,7 +74,7 @@ class FakeLogger(object): class TestHTTP(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( - ('localhost', 0), HTTPTestRequestHandler) + ('127.0.0.1', 0), HTTPTestRequestHandler) self.port = http_server_port(self.httpd) self.server_thread = threading.Thread(target=self.httpd.serve_forever) self.server_thread.daemon = True @@ -86,15 +86,15 @@ class TestHTTP(unittest.TestCase): return ydl = YoutubeDL({'logger': FakeLogger()}) - r = ydl.extract_info('http://localhost:%d/302' % self.port) - self.assertEqual(r['entries'][0]['url'], 'http://localhost:%d/vid.mp4' % self.port) + r = ydl.extract_info('http://127.0.0.1:%d/302' % self.port) + self.assertEqual(r['entries'][0]['url'], 'http://127.0.0.1:%d/vid.mp4' % self.port) class TestHTTPS(unittest.TestCase): def setUp(self): certfn = os.path.join(TEST_DIR, 'testcert.pem') self.httpd = compat_http_server.HTTPServer( - ('localhost', 0), HTTPTestRequestHandler) + ('127.0.0.1', 0), HTTPTestRequestHandler) self.httpd.socket = ssl.wrap_socket( self.httpd.socket, certfile=certfn, server_side=True) self.port = http_server_port(self.httpd) @@ -107,11 +107,11 @@ class TestHTTPS(unittest.TestCase): ydl = YoutubeDL({'logger': FakeLogger()}) self.assertRaises( Exception, - ydl.extract_info, 'https://localhost:%d/video.html' % self.port) + ydl.extract_info, 'https://127.0.0.1:%d/video.html' % self.port) ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) - r = ydl.extract_info('https://localhost:%d/video.html' % self.port) - self.assertEqual(r['entries'][0]['url'], 'https://localhost:%d/vid.mp4' % self.port) + r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) + self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) def _build_proxy_handler(name): @@ -132,23 +132,23 @@ def _build_proxy_handler(name): class TestProxy(unittest.TestCase): def setUp(self): self.proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('normal')) + ('127.0.0.1', 0), _build_proxy_handler('normal')) self.port = http_server_port(self.proxy) self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) self.proxy_thread.daemon = True self.proxy_thread.start() self.geo_proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('geo')) + ('127.0.0.1', 0), _build_proxy_handler('geo')) self.geo_port = http_server_port(self.geo_proxy) self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever) self.geo_proxy_thread.daemon = True self.geo_proxy_thread.start() def test_proxy(self): - geo_proxy = 'localhost:{0}'.format(self.geo_port) + geo_proxy = '127.0.0.1:{0}'.format(self.geo_port) ydl = YoutubeDL({ - 'proxy': 'localhost:{0}'.format(self.port), + 'proxy': '127.0.0.1:{0}'.format(self.port), 'geo_verification_proxy': geo_proxy, }) url = 'http://foo.com/bar' @@ -162,7 +162,7 @@ class TestProxy(unittest.TestCase): def test_proxy_with_idn(self): ydl = YoutubeDL({ - 'proxy': 'localhost:{0}'.format(self.port), + 'proxy': '127.0.0.1:{0}'.format(self.port), }) url = 'http://中文.tw/' response = ydl.urlopen(url).read().decode('utf-8') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ba684a075..9bb952457 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -191,6 +191,11 @@ def _real_main(argv=None): if numeric_buffersize is None: parser.error('invalid buffer size specified') opts.buffersize = numeric_buffersize + if opts.http_chunk_size is not None: + numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) + if not numeric_chunksize: + parser.error('invalid http chunk size specified') + opts.http_chunk_size = numeric_chunksize if opts.playliststart <= 0: raise ValueError('Playlist start must be positive') if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: @@ -346,6 +351,7 @@ def _real_main(argv=None): 'keep_fragments': opts.keep_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, 'continuedl': opts.continue_dl, 'noprogress': opts.noprogress, 'progress_with_newline': opts.progress_with_newline, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 27ece2d29..4a611f183 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2909,8 +2909,8 @@ else: if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8): class compat_Struct(struct.Struct): def unpack(self, string): - if not isinstance(string, buffer): - string = buffer(string) + if not isinstance(string, buffer): # noqa: F821 + string = buffer(string) # noqa: F821 return super(compat_Struct, self).unpack(string) else: compat_Struct = struct.Struct diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 3ff26ff70..dc2b37beb 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -7,10 +7,14 @@ import time import re from .common import FileDownloader -from ..compat import compat_urllib_error +from ..compat import ( + compat_str, + compat_urllib_error, +) from ..utils import ( ContentTooShortError, encodeFilename, + int_or_none, sanitize_open, sanitized_Request, write_xattr, @@ -42,17 +46,22 @@ class HttpFD(FileDownloader): request = sanitized_Request(url, None, headers) is_test = self.params.get('test', False) - - if is_test: - request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + self.params.get('http_chunk_size') or 0) ctx.open_mode = 'wb' ctx.resume_len = 0 + ctx.data_len = None + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() if self.params.get('continuedl', True): # Establish possible resume length if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = os.path.getsize( + encodeFilename(ctx.tmpfilename)) + + ctx.is_resume = ctx.resume_len > 0 count = 0 retries = self.params.get('retries', 0) @@ -64,11 +73,33 @@ class HttpFD(FileDownloader): def __init__(self, source_error): self.source_error = source_error + class NextFragment(Exception): + pass + + def set_range(req, start, end): + range_header = 'bytes=%d-' % start + if end: + range_header += compat_str(end) + req.add_header('Range', range_header) + def establish_connection(): - if ctx.resume_len != 0: - self.report_resuming_byte(ctx.resume_len) - request.add_header('Range', 'bytes=%d-' % ctx.resume_len) + if ctx.resume_len > 0: + range_start = ctx.resume_len + if ctx.is_resume: + self.report_resuming_byte(ctx.resume_len) ctx.open_mode = 'ab' + elif chunk_size > 0: + range_start = 0 + else: + range_start = None + ctx.is_resume = False + range_end = range_start + chunk_size - 1 if chunk_size else None + if range_end and ctx.data_len is not None and range_end >= ctx.data_len: + range_end = ctx.data_len - 1 + has_range = range_start is not None + ctx.has_range = has_range + if has_range: + set_range(request, range_start, range_end) # Establish connection try: ctx.data = self.ydl.urlopen(request) @@ -77,25 +108,35 @@ class HttpFD(FileDownloader): # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if ctx.resume_len > 0: + if has_range: content_range = ctx.data.headers.get('Content-Range') if content_range: - content_range_m = re.search(r'bytes (\d+)-', content_range) + content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and ctx.resume_len == int(content_range_m.group(1)): - return + if content_range_m: + if range_start == int(content_range_m.group(1)): + content_range_end = int_or_none(content_range_m.group(2)) + content_len = int_or_none(content_range_m.group(3)) + accept_content_len = ( + # Non-chunked download + not chunk_size or + # Chunked download and requested piece or + # its part is promised to be served + content_range_end == range_end or + content_len < range_end) + if accept_content_len: + ctx.data_len = content_len + return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' + ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) return except (compat_urllib_error.HTTPError, ) as err: - if (err.code < 500 or err.code >= 600) and err.code != 416: - # Unexpected HTTP error - raise - elif err.code == 416: + if err.code == 416: # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header @@ -130,6 +171,15 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.open_mode = 'wb' return + elif err.code == 302: + if not chunk_size: + raise + # HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop. + # may happen during chunk downloading. This is usually fixed + # with a retry. + elif err.code < 500 or err.code >= 600: + # Unexpected HTTP error + raise raise RetryDownload(err) except socket.error as err: if err.errno != errno.ECONNRESET: @@ -160,7 +210,7 @@ class HttpFD(FileDownloader): return False byte_counter = 0 + ctx.resume_len - block_size = self.params.get('buffersize', 1024) + block_size = ctx.block_size start = time.time() # measure time over whole while-loop, so slow_down() and best_block_size() work together properly @@ -233,25 +283,30 @@ class HttpFD(FileDownloader): # Progress message speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if data_len is None: + if ctx.data_len is None: eta = None else: - eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len) + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) self._hook_progress({ 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, + 'total_bytes': ctx.data_len, 'tmpfilename': ctx.tmpfilename, 'filename': ctx.filename, 'eta': eta, 'speed': speed, - 'elapsed': now - start, + 'elapsed': now - ctx.start_time, }) if is_test and byte_counter == data_len: break + if not is_test and chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + ctx.resume_len = byte_counter + # ctx.block_size = block_size + raise NextFragment() + if ctx.stream is None: self.to_stderr('\n') self.report_error('Did not get any data blocks') @@ -276,7 +331,7 @@ class HttpFD(FileDownloader): 'total_bytes': byte_counter, 'filename': ctx.filename, 'status': 'finished', - 'elapsed': time.time() - start, + 'elapsed': time.time() - ctx.start_time, }) return True @@ -290,6 +345,8 @@ class HttpFD(FileDownloader): if count <= retries: self.report_retry(e.source_error, count, retries) continue + except NextFragment: + continue except SucceedDownload: return True diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index dd3b18d72..6fb3d6c53 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -11,7 +11,7 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', 'md5': '', @@ -51,6 +51,9 @@ class AMCNetworksIE(ThePlatformIE): }, { 'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', 'only_matching': True, + }, { + 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 681d63e29..6596e98a6 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -75,10 +75,10 @@ class CBSInteractiveIE(CBSIE): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-(?:cnet|zdnet)-video(?:-uvp(?:js)?)?-options='([^']+)'", + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) - vdata = data.get('video') or data['videos'][0] + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] video_id = vdata['mpxRefId'] diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a08dace43..b73446773 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -26,7 +26,7 @@ from ..utils import ( class DPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?Pwww\.(?Pdplay\.(?Pdk|se|no)))/(?:videoer/)?(?P[^/]+/[^/?#]+)' + _VALID_URL = r'https?://(?Pwww\.(?Pdplay\.(?Pdk|se|no)))/(?:video(?:er|s)/)?(?P[^/]+/[^/?#]+)' _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -89,9 +89,12 @@ class DPlayIE(InfoExtractor): 'skip_download': True, }, }, { - # geo restricted, bypassable via X-Forwarded-For + 'url': 'https://www.dplay.dk/videoer/singleliv/season-5-episode-3', 'only_matching': True, + }, { + 'url': 'https://www.dplay.se/videos/sofias-anglar/sofias-anglar-1001', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 9e8d28f48..246f6795a 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -68,7 +68,7 @@ class NationalGeographicVideoIE(InfoExtractor): class NationalGeographicIE(ThePlatformIE, AdobePassIE): IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P[^/?]+)' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:wild/)?[^/]+/)?(?:videos|episodes)/(?P[^/?]+)' _TESTS = [ { @@ -102,6 +102,10 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): { 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index e38c7618e..e86c65396 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, int_or_none, xpath_text, @@ -26,17 +28,15 @@ class PladformIE(InfoExtractor): (?P\d+) ''' _TESTS = [{ - # http://muz-tv.ru/kinozal/view/7400/ - 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', - 'md5': '61f37b575dd27f1bb2e1854777fe31f4', + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', 'info_dict': { - 'id': '100183293', + 'id': '3777899', 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, + 'duration': 3190, }, }, { 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', @@ -56,22 +56,48 @@ class PladformIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + pl = qs.get('pl', ['1'])[0] + video = self._download_xml( - 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, - video_id) + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) + + def fail(text): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, text), + expected=True) if video.tag == 'error': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, video.text), - expected=True) + fail(video.text) quality = qualities(('ld', 'sd', 'hd')) - formats = [{ - 'url': src.text, - 'format_id': src.get('quality'), - 'quality': quality(src.get('quality')), - } for src in video.findall('./src')] + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + self._sort_formats(formats) webpage = self._download_webpage( diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 5d6cc3610..243603676 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -5,135 +5,93 @@ from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( float_or_none, - int_or_none, - try_get, - # unified_timestamp, ExtractorError, ) class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?PAP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?PAP-\w+)' _TESTS = [{ # film - 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', 'md5': 'fb0445b98aa4394e504b413d98031d1f', 'info_dict': { - 'id': 'AP-1Q756YYX51W11', + 'id': 'AP-1Q6XCDTAN1W11', 'ext': 'mp4', - 'title': 'ABC of...WRC', + 'title': 'ABC of... WRC - ABC of... S1E6', 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', 'duration': 1582.04, - # 'timestamp': 1488405786, - # 'upload_date': '20170301', }, }, { # episode - 'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web', + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', 'info_dict': { - 'id': 'AP-1PMT5JCWH1W11', + 'id': 'AP-1PMHKJFCW1W11', 'ext': 'mp4', - 'title': 'Grime - Hashtags S2 E4', - 'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:b5f522b89b72e1e23216e5018810bb25', 'duration': 904.6, - # 'timestamp': 1487290093, - # 'upload_date': '20170217', - 'series': 'Hashtags', - 'season_number': 2, - 'episode_number': 4, }, 'params': { 'skip_download': True, }, - }, { - # segment - 'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals', - 'info_dict': { - 'id': 'AP-1QSAQJ6V52111', - 'ext': 'mp4', - 'title': 'Semi Finals - Vans Park Series Pro Tour', - 'description': 'md5:306a2783cdafa9e65e39aa62f514fd97', - 'duration': 11791.991, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', - 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) session = self._download_json( - 'https://api-v2.redbull.tv/session', video_id, + 'https://api.redbull.tv/v3/session', video_id, note='Downloading access token', query={ - 'build': '4.370.0', 'category': 'personal_computer', - 'os_version': '1.0', 'os_family': 'http', }) if session.get('code') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, session['message'])) - auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token']) + token = session['token'] try: - info = self._download_json( - 'https://api-v2.redbull.tv/content/%s' % video_id, + video = self._download_json( + 'https://api.redbull.tv/v3/products/' + video_id, video_id, note='Downloading video information', - headers={'Authorization': auth} + headers={'Authorization': token} ) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: error_message = self._parse_json( - e.cause.read().decode(), video_id)['message'] + e.cause.read().decode(), video_id)['error'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) raise - video = info['video_product'] - - title = info['title'].strip() + title = video['title'].strip() formats = self._extract_m3u8_formats( - video['url'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) subtitles = {} - for _, captions in (try_get( - video, lambda x: x['attachments']['captions'], - dict) or {}).items(): - if not captions or not isinstance(captions, list): - continue - for caption in captions: - caption_url = caption.get('url') - if not caption_url: - continue - ext = caption.get('format') - if ext == 'xml': - ext = 'ttml' - subtitles.setdefault(caption.get('lang') or 'en', []).append({ - 'url': caption_url, - 'ext': ext, - }) + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) - subheading = info.get('subheading') + subheading = video.get('subheading') if subheading: title += ' - %s' % subheading return { 'id': video_id, 'title': title, - 'description': info.get('long_description') or info.get( + 'description': video.get('long_description') or video.get( 'short_description'), 'duration': float_or_none(video.get('duration'), scale=1000), - # 'timestamp': unified_timestamp(info.get('published')), - 'series': info.get('show_title'), - 'season_number': int_or_none(info.get('season_number')), - 'episode_number': int_or_none(info.get('episode_number')), 'formats': formats, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index f70a75256..843e45d36 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -46,9 +46,10 @@ class RedTubeIE(InfoExtractor): raise ExtractorError('Video %s has been removed' % video_id, expected=True) title = self._html_search_regex( - (r'

(?P.+?)</h1>', - r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), - webpage, 'title', group='title') + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) formats = [] sources = self._parse_json( @@ -87,12 +88,13 @@ class RedTubeIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( - r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + r'<span[^>]+>ADDED ([^<]+)<', webpage, 'upload date', fatal=False)) duration = int_or_none(self._search_regex( r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( - r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'), webpage, 'view count', fatal=False)) # No self-labeling, but they describe themselves as diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index bba25a233..be36acc46 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -93,58 +93,11 @@ class RtlNlIE(InfoExtractor): meta = info.get('meta', {}) - # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. - # To workaround this previously adaptive -> flash trick was used to obtain - # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) - # and bypass georestrictions as well. - # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore - # unusable albeit can be fixed by simple string replacement (see - # https://github.com/rg3/youtube-dl/pull/6337) - # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted - # streams are used now. videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats( m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) - - video_urlpart = videopath.split('/adaptive/')[1][:-5] - PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - - PG_FORMATS = ( - ('a2t', 512, 288), - ('a3t', 704, 400), - ('nettv', 1280, 720), - ) - - def pg_format(format_id, width, height): - return { - 'url': PG_URL_TEMPLATE % (format_id, video_urlpart), - 'format_id': 'pg-%s' % format_id, - 'protocol': 'http', - 'width': width, - 'height': height, - } - - if not formats: - formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS] - else: - pg_formats = [] - for format_id, width, height in PG_FORMATS: - try: - # Find hls format with the same width and height corresponding - # to progressive format and copy metadata from it. - f = next(f for f in formats if f.get('height') == height) - # hls formats may have invalid width - f['width'] = width - f_copy = f.copy() - f_copy.update(pg_format(format_id, width, height)) - pg_formats.append(f_copy) - except StopIteration: - # Missing hls format does mean that no progressive format with - # such width and height exists either. - pass - formats.extend(pg_formats) self._sort_formats(formats) thumbnails = [] diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index cf32d1e0c..6d4e3b76d 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -159,7 +159,6 @@ class SeznamZpravyArticleIE(InfoExtractor): webpage = self._download_webpage(url, article_id) info = self._search_json_ld(webpage, article_id, default={}) - print(info) title = info.get('title') or self._og_search_title(webpage, fatal=False) description = info.get('description') or self._og_search_description(webpage) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 547be8f95..d435f7157 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, int_or_none, @@ -57,7 +61,7 @@ class SixPlayIE(InfoExtractor): container = asset.get('video_container') ext = determine_ext(asset_url) if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp': + if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 97ff422f0..1ca310b90 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -158,7 +158,6 @@ class SoundcloudIE(InfoExtractor): ] _CLIENT_ID = 'DQskPX1pntALRzMp4HSxya3Mc0AO66Ro' - _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod def _extract_urls(webpage): diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 4c0455044..7d1bbc021 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -478,6 +478,11 @@ def parseOpts(overrideArguments=None): '--no-resize-buffer', action='store_true', dest='noresizebuffer', default=False, help='Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.') + downloader.add_option( + '--http-chunk-size', + dest='http_chunk_size', metavar='SIZE', default=None, + help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' + 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)') downloader.add_option( '--test', action='store_true', dest='test', default=False, diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8a2b57ffb..7ae919523 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.01.27' +__version__ = '2018.02.03'