diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3e1ff1536..bd9e21983 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.23 +[debug] youtube-dl version 2017.09.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a60bd5fc8..c439c8ef9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,43 @@ +version 2017.09.02 + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + +version 2017.08.27.1 + +Extractors + +* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037) + + +version 2017.08.27 + +Core ++ [extractor/common] Extract height and format id for HTML5 videos (#14034) +* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, + #8625, #9483) + * Simplify code and split into separate routines to facilitate maintaining + * Make retry mechanism work on errors during actual download not only + during connection establishment phase + * Retry on ECONNRESET and ETIMEDOUT during reading data from network + * Retry on content too short + * Show error description on retry + +Extractors +* [generic] Lower preference for extraction from LD-JSON +* [rai] Fix audio formats extraction (#14024) +* [youtube] Fix controversy videos extraction (#14027, #14029) +* [mixcloud] Fix extraction (#14015, #14020) + + version 2017.08.23 Core diff --git a/Makefile b/Makefile index 41e3a683a..c74eea792 100644 --- a/Makefile +++ b/Makefile @@ -49,11 +49,11 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py mkdir -p zip for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ mkdir -p zip/$$d ;\ - cp -a $$d/*.py zip/$$d/ ;\ + cp -pPR $$d/*.py zip/$$d/ ;\ done touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py mv zip/youtube_dl/__main__.py zip/ - cd zip ; zip --quiet ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py + cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py rm -rf zip echo '#!$(PYTHON)' > youtube-dl cat youtube-dl.zip >> youtube-dl diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 77242dacc..75b8166c5 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -304,11 +304,11 @@ class FileDownloader(object): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, count, retries): + def report_retry(self, err, count, retries): """Report retry in case of HTTP error 5xx""" self.to_screen( - '[download] Got server HTTP error. Retrying (attempt %d of %s)...' - % (count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' + % (error_to_compat_str(err), count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index af405b950..8a6638cc2 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -22,8 +22,16 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): url = info_dict['url'] - tmpfilename = self.temp_name(filename) - stream = None + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} @@ -38,46 +46,51 @@ class HttpFD(FileDownloader): if is_test: request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - else: - resume_len = 0 + ctx.open_mode = 'wb' + ctx.resume_len = 0 - open_mode = 'wb' - if resume_len != 0: - if self.params.get('continuedl', True): - self.report_resuming_byte(resume_len) - request.add_header('Range', 'bytes=%d-' % resume_len) - open_mode = 'ab' - else: - resume_len = 0 + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) count = 0 retries = self.params.get('retries', 0) - while count <= retries: + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + def establish_connection(): + if ctx.resume_len != 0: + self.report_resuming_byte(ctx.resume_len) + request.add_header('Range', 'bytes=%d-' % ctx.resume_len) + ctx.open_mode = 'ab' # Establish connection try: - data = self.ydl.urlopen(request) + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if resume_len > 0: - content_range = data.headers.get('Content-Range') + if ctx.resume_len > 0: + content_range = ctx.data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and resume_len == int(content_range_m.group(1)): - break + if content_range_m and ctx.resume_len == int(content_range_m.group(1)): + return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: # Unexpected HTTP error @@ -86,15 +99,15 @@ class HttpFD(FileDownloader): # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - data = self.ydl.urlopen(basic_request) - content_length = data.info()['Content-Length'] + ctx.data = self.ydl.urlopen(basic_request) + content_length = ctx.data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: raise else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < int(content_length) < resume_len + 100)): + (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -102,152 +115,184 @@ class HttpFD(FileDownloader): # I decided to implement a suggested change and consider the file # completely downloaded if the file size differs less than 100 bytes from # the one in the hard drive. - self.report_file_already_downloaded(filename) - self.try_rename(tmpfilename, filename) + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) self._hook_progress({ - 'filename': filename, + 'filename': ctx.filename, 'status': 'finished', - 'downloaded_bytes': resume_len, - 'total_bytes': resume_len, + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, }) - return True + raise SucceedDownload() else: # The length does not match, we start the download over self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break - except socket.error as e: - if e.errno != errno.ECONNRESET: + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: # Connection reset is no problem, just retry raise + raise RetryDownload(err) - # Retry - count += 1 - if count <= retries: - self.report_retry(count, retries) + def download(): + data_len = ctx.data.info().get('Content-length', None) - if count > retries: - self.report_error('giving up after %s retries' % retries) - return False + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE - data_len = data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) - return False - - byte_counter = 0 + resume_len - block_size = self.params.get('buffersize', 1024) - start = time.time() - - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - while True: - - # Download and write - data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - byte_counter += len(data_block) - - # exit loop when download is finished - if len(data_block) == 0: - break - - # Open destination file just in time - if stream is None: - try: - (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) - assert stream is not None - filename = self.undo_temp_name(tmpfilename) - self.report_destination(filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) return False - if self.params.get('xattr_set_filesize', False) and data_len is not None: + byte_counter = 0 + ctx.resume_len + block_size = self.params.get('buffersize', 1024) + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + if ctx.tmpfilename != '-': + ctx.stream.close() + ctx.stream = None + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: try: - write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except (XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False - try: - stream.write(data_block) - except (IOError, OSError) as err: + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - start, + }) + + if is_test and byte_counter == data_len: + break + + if ctx.stream is None: self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) + self.report_error('Did not get any data blocks') return False + if ctx.tmpfilename != '-': + ctx.stream.close() - # Apply rate limit - self.slow_down(start, now, byte_counter - resume_len) + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err - # end measuring of one loop run - now = time.time() - after = now + self.try_rename(ctx.tmpfilename, ctx.filename) - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - resume_len) - if data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) self._hook_progress({ - 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - start, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - start, }) - if is_test and byte_counter == data_len: - break + return True - if stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if tmpfilename != '-': - stream.close() + while count <= retries: + try: + establish_connection() + download() + return True + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except SucceedDownload: + return True - if data_len is not None and byte_counter != data_len: - raise ContentTooShortError(byte_counter, int(data_len)) - self.try_rename(tmpfilename, filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) - - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - start, - }) - - return True + self.report_error('giving up after %s retries' % retries) + return False diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 911ae6780..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -29,7 +29,7 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pb][\da-z]{7}' + _ID_REGEX = r'[pbw][\da-z]{7}' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -233,6 +233,9 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 2d517f231..42c9af263 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -5,7 +5,7 @@ from ..utils import remove_end class CharlieRoseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P\d+)' _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', @@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor): }, { 'url': 'https://charlierose.com/videos/27996', 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1804c4de0..74d30ec50 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2184,6 +2184,12 @@ class InfoExtractor(object): f = parse_content_type(source_attributes.get('type')) is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: + # res attribute is not standard but seen several times + # in the wild + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 49b00b87e..c81efdc00 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2871,12 +2871,6 @@ class GenericIE(InfoExtractor): merged[k] = v return merged - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -2895,6 +2889,12 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False, base_url=url) return merge_dicts(info, info_dict) + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 97ff28219..3bf462d63 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, lowercase_escape, @@ -12,27 +13,53 @@ from ..utils import ( class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P[a-zA-Z0-9_-]{28,})' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:docs|drive)\.google\.com/ + (?: + (?:uc|open)\?.*?id=| + file/d/ + )| + video\.google\.com/get_player\?.*?docid= + ) + (?P[a-zA-Z0-9_-]{28,}) + ''' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': 'd109872761f7e7ecf353fa108c0dbe1e', + 'md5': '5c602afbbf2c1db91831f5d82f678554', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, } + }, { + # video can't be watched anonymously due to view count limit reached, + # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046) + 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', + 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', + 'info_dict': { + 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', + 'ext': 'mp4', + 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', + } }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'md5': 'c230c67252874fddd8170e3fd1a45886', 'info_dict': { 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', 'ext': 'mp4', 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', 'duration': 189, }, - 'only_matching': True + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, }] _FORMATS_EXT = { '5': 'flv', @@ -147,47 +174,84 @@ class GoogleDriveIE(InfoExtractor): webpage = self._download_webpage( 'http://docs.google.com/file/d/%s' % video_id, video_id) - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason) - - title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + title = self._search_regex( + r'"title"\s*,\s*"([^"]+)', webpage, 'title', + default=None) or self._og_search_title(webpage) duration = int_or_none(self._search_regex( r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') - - resolutions = {} - for fmt in fmt_list: - mobj = re.search( - r'^(?P\d+)/(?P\d+)[xX](?P\d+)', fmt) - if mobj: - resolutions[mobj.group('format_id')] = ( - int(mobj.group('width')), int(mobj.group('height'))) formats = [] - for fmt_stream in fmt_stream_map: - fmt_stream_split = fmt_stream.split('|') - if len(fmt_stream_split) < 2: - continue - format_id, format_url = fmt_stream_split[:2] - f = { - 'url': lowercase_escape(format_url), - 'format_id': format_id, - 'ext': self._FORMATS_EXT[format_id], - } - resolution = resolutions.get(format_id) - if resolution: - f.update({ - 'width': resolution[0], - 'height': resolution[1], + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, + 'fmt stream map', default='').split(',') + fmt_list = self._search_regex( + r'"fmt_list"\s*,\s*"([^"]+)', webpage, + 'fmt_list', default='').split(',') + if fmt_stream_map and fmt_list: + resolutions = {} + for fmt in fmt_list: + mobj = re.search( + r'^(?P\d+)/(?P\d+)[xX](?P\d+)', fmt) + if mobj: + resolutions[mobj.group('format_id')] = ( + int(mobj.group('width')), int(mobj.group('height'))) + + for fmt_stream in fmt_stream_map: + fmt_stream_split = fmt_stream.split('|') + if len(fmt_stream_split) < 2: + continue + format_id, format_url = fmt_stream_split[:2] + f = { + 'url': lowercase_escape(format_url), + 'format_id': format_id, + 'ext': self._FORMATS_EXT[format_id], + } + resolution = resolutions.get(format_id) + if resolution: + f.update({ + 'width': resolution[0], + 'height': resolution[1], + }) + formats.append(f) + + source_url = update_url_query( + 'https://drive.google.com/uc', { + 'id': video_id, + 'export': 'download', + }) + urlh = self._request_webpage( + source_url, video_id, note='Requesting source file', + errnote='Unable to request source file', fatal=False) + if urlh: + def add_source_format(src_url): + formats.append({ + 'url': src_url, + 'ext': determine_ext(title, 'mp4').lower(), + 'format_id': 'source', + 'quality': 1, }) - formats.append(f) + if urlh.headers.get('Content-Disposition'): + add_source_format(source_url) + else: + confirmation_webpage = self._webpage_read_content( + urlh, url, video_id, note='Downloading confirmation page', + errnote='Unable to confirm download', fatal=False) + if confirmation_webpage: + confirm = self._search_regex( + r'confirm=([^&"\']+)', confirmation_webpage, + 'confirmation code', fatal=False) + if confirm: + add_source_format(update_url_query(source_url, { + 'confirm': confirm, + })) + + if not formats: + reason = self._search_regex( + r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason, expected=True) + self._sort_formats(formats) hl = self._search_regex( diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 25d3fc4b3..f6360cce6 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -92,7 +92,7 @@ class MixcloudIE(InfoExtractor): js = self._download_webpage(js_url, track_id, fatal=False) if js: KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value', 'key_value_two'): + for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): key = self._search_regex( KEY_RE_TEMPLATE % key_name, js, 'key', default=None, group='key') diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 36761788d..b52879c7a 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor): r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') sources = self._parse_json(js_to_json(self._search_regex( - r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]", + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) if not sources: @@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'(\d+) views\s*<', webpage, 'view count', fatal=False)) thumbnail = self._search_regex( - r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) + r"poster'?\s*:\s*([\"'])(?P(?:(?!\1).)+)\1", webpage, + 'thumbnail', fatal=False, group='url') return { 'id': video_id, diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 3b40002a8..6bbc2781c 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -59,6 +59,7 @@ class RadioCanadaIE(InfoExtractor): device_types.append('android') formats = [] + error = None # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file for device_type in device_types: @@ -84,8 +85,8 @@ class RadioCanadaIE(InfoExtractor): if not v_url: continue if v_url == 'null': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, xpath_text(v_data, 'message')), expected=True) + error = xpath_text(v_data, 'message') + continue ext = determine_ext(v_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -129,6 +130,9 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) + if not formats and error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e11bf8f9a..5bf64a56b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -345,11 +345,11 @@ class RaiIE(RaiBaseIE): media_type = media['type'] if 'Audio' in media_type: relinker_info = { - 'formats': { + 'formats': [{ 'format_id': media.get('formatoAudio'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), - } + }] } elif 'Video' in media_type: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 4adcd1830..a0abbae60 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,12 +4,14 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_HTTPError, compat_str, + compat_urlparse, ) from ..utils import ( - parse_duration, + ExtractorError, js_to_json, + parse_duration, parse_iso8601, ) @@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor): base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json( - '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), - lecture_id)['lecture'][0] + try: + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json( + e.cause.read().decode('utf-8'), lecture_id) + raise ExtractorError(msg['detail'], expected=True) + raise lecture_info = { 'id': lecture_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e71795e7..953e38227 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _download_webpage(self, *args, **kwargs): + kwargs.setdefault('query', {})['disable_polymer'] = 'true' + return super(YoutubeBaseInfoExtractor, self)._download_webpage( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -1003,6 +1009,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Skipping DASH manifest', ], }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + 'license': 'Standard YouTube License', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { # itag 212 'url': '1t24XAntNCY', @@ -1437,9 +1464,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + # Get video info embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1509,6 +1541,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue get_video_info = compat_parse_qs(video_info_webpage) add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: @@ -1592,10 +1626,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self.playlist_result(entries, video_id, video_title, video_description) self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if 'view_count' in video_info: - view_count = int(video_info['view_count'][0]) - else: - view_count = None + if view_count is None: + view_count = extract_view_count(video_info) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1639,10 +1671,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)'], + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -2028,7 +2058,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 94d35a66a..60ed35de9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.23' +__version__ = '2017.09.02'