From 47e0cef46e9a76e589a2b1cde1b2dfa8bcf01d8e Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sun, 16 Apr 2017 00:34:34 +0200 Subject: [PATCH 001/238] [openload] rewrite extractor --- youtube_dl/extractor/openload.py | 110 +++++++++++++++++++------------ 1 file changed, 67 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d8036b54a..789bf997e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,12 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import os import re +import subprocess +import tempfile from .common import InfoExtractor -from ..compat import compat_chr from ..utils import ( + check_executable, determine_ext, + encodeArgument, ExtractorError, ) @@ -58,6 +62,39 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] + _PHANTOMJS_SCRIPT = r''' + phantom.onError = function(msg, trace) { + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) { + msgStack.push('TRACE:'); + trace.forEach(function(t) { + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }); + } + console.error(msgStack.join('\n')); + phantom.exit(1); + }; + var page = require('webpage').create(); + page.settings.resourceTimeout = 10000; + page.onInitialized = function() { + page.evaluate(function() { + delete window._phantom; + delete window.callPhantom; + }); + }; + page.open('https://openload.co/embed/%s/', function(status) { + var info = page.evaluate(function() { + return { + decoded_id: document.getElementById('streamurl').innerHTML, + title: document.querySelector('meta[name="og:title"],' + + 'meta[name=description]').content + }; + }); + console.log(info.decoded_id + ' ' + info.title); + phantom.exit(); + });''' + @staticmethod def _extract_urls(webpage): return re.findall( @@ -65,61 +102,48 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): + exe = check_executable('phantomjs', ['-v']) + if not exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + video_id = self._match_id(url) - webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) + url = 'https://openload.co/embed/%s/' % video_id + webpage = self._download_webpage(url, video_id) if 'File not found' in webpage or 'deleted by the owner' in webpage: - raise ExtractorError('File not found', expected=True) + raise ExtractorError('File not found', expected=True, video_id=video_id) - ol_id = self._search_regex( - ']+id="[^"]+"[^>]*>([0-9A-Za-z]+)', - webpage, 'openload ID') + script_file = tempfile.NamedTemporaryFile(mode='w', delete=False) - decoded = '' - a = ol_id[0:24] - b = [] - for i in range(0, len(a), 8): - b.append(int(a[i:i + 8] or '0', 16)) - ol_id = ol_id[24:] - j = 0 - k = 0 - while j < len(ol_id): - c = 128 - d = 0 - e = 0 - f = 0 - _more = True - while _more: - if j + 1 >= len(ol_id): - c = 143 - f = int(ol_id[j:j + 2] or '0', 16) - j += 2 - d += (f & 127) << e - e += 7 - _more = f >= c - g = d ^ b[k % 3] - for i in range(4): - char_dec = (g >> 8 * i) & (c + 127) - char = compat_chr(char_dec) - if char != '#': - decoded += char - k += 1 + # write JS script to file and close it + with script_file: + script_file.write(self._PHANTOMJS_SCRIPT % video_id) - video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % decoded + self.to_screen('%s: Decoding video ID with PhantomJS' % video_id) - title = self._og_search_title(webpage, default=None) or self._search_regex( - r']+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) + p = subprocess.Popen([exe, '--ssl-protocol=any', script_file.name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Decoding failed\n:' + + encodeArgument(err)) + else: + decoded_id, title = encodeArgument(output).strip().split(' ', 1) + + os.remove(script_file.name) + + video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id entries = self._parse_html5_media_entries(url, webpage, video_id) - subtitles = entries[0]['subtitles'] if entries else None + entry = entries[0] if entries else {} + subtitles = entry.get('subtitles') info_dict = { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), From da57ebaf84225240b356530cdf02d12596f0dce8 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 25 Apr 2017 01:06:14 +0200 Subject: [PATCH 002/238] [openload] separate PhantomJS code from extractor --- youtube_dl/extractor/openload.py | 78 ++++------------- youtube_dl/utils.py | 141 +++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 789bf997e..ac5e0bb08 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import os import re -import subprocess -import tempfile from .common import InfoExtractor from ..utils import ( - check_executable, determine_ext, - encodeArgument, ExtractorError, + get_element_by_id, + PhantomJSwrapper, ) @@ -62,38 +59,7 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] - _PHANTOMJS_SCRIPT = r''' - phantom.onError = function(msg, trace) { - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) { - msgStack.push('TRACE:'); - trace.forEach(function(t) { - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? ' (in function ' + t.function +')' : '')); - }); - } - console.error(msgStack.join('\n')); - phantom.exit(1); - }; - var page = require('webpage').create(); - page.settings.resourceTimeout = 10000; - page.onInitialized = function() { - page.evaluate(function() { - delete window._phantom; - delete window.callPhantom; - }); - }; - page.open('https://openload.co/embed/%s/', function(status) { - var info = page.evaluate(function() { - return { - decoded_id: document.getElementById('streamurl').innerHTML, - title: document.querySelector('meta[name="og:title"],' - + 'meta[name=description]').content - }; - }); - console.log(info.decoded_id + ' ' + info.title); - phantom.exit(); - });''' + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @staticmethod def _extract_urls(webpage): @@ -102,40 +68,27 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): - exe = check_executable('phantomjs', ['-v']) - if not exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) - video_id = self._match_id(url) url = 'https://openload.co/embed/%s/' % video_id - webpage = self._download_webpage(url, video_id) + headers = { + 'User-Agent': self._USER_AGENT, + } + + phantom = PhantomJSwrapper(self) + webpage, _ = phantom.get(url, video_id=video_id, headers=headers) if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) - script_file = tempfile.NamedTemporaryFile(mode='w', delete=False) - - # write JS script to file and close it - with script_file: - script_file.write(self._PHANTOMJS_SCRIPT % video_id) - - self.to_screen('%s: Decoding video ID with PhantomJS' % video_id) - - p = subprocess.Popen([exe, '--ssl-protocol=any', script_file.name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output, err = p.communicate() - if p.returncode != 0: - raise ExtractorError('Decoding failed\n:' - + encodeArgument(err)) - else: - decoded_id, title = encodeArgument(output).strip().split(' ', 1) - - os.remove(script_file.name) + decoded_id = get_element_by_id('streamurl', webpage) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + title = self._og_search_title(webpage, default=None) or self._search_regex( + r']+class=["\']title["\'][^>]*>([^<]+)', webpage, + 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', fatal=True) + entries = self._parse_html5_media_entries(url, webpage, video_id) entry = entries[0] if entries else {} subtitles = entry.get('subtitles') @@ -148,5 +101,6 @@ class OpenloadIE(InfoExtractor): # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), 'subtitles': subtitles, + 'http_headers': headers, } return info_dict diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2340bc306..94e1b07a6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3652,3 +3652,144 @@ def write_xattr(path, key, value): "Couldn't find a tool to set the xattrs. " "Install either the python 'xattr' module, " "or the 'xattr' binary.") + + +class PhantomJSwrapper(object): + """PhantomJS wrapper class""" + + _TEMPLATE = r''' + phantom.onError = function(msg, trace) {{ + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) {{ + msgStack.push('TRACE:'); + trace.forEach(function(t) {{ + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }}); + }} + console.error(msgStack.join('\n')); + phantom.exit(1); + }}; + var page = require('webpage').create(); + var fs = require('fs'); + var read = {{ mode: 'r', charset: 'utf-8' }}; + var write = {{ mode: 'w', charset: 'utf-8' }}; + page.settings.resourceTimeout = {timeout}; + page.settings.userAgent = "{ua}"; + page.onLoadStarted = function() {{ + page.evaluate(function() {{ + delete window._phantom; + delete window.callPhantom; + }}); + }}; + var saveAndExit = function() {{ + fs.write("{html}", page.content, write); + phantom.exit(); + }}; + page.onLoadFinished = function(status) {{ + if(page.url === "") {{ + page.setContent(fs.read("{html}", read), "{url}"); + }} + else {{ + {jscode} + }} + }}; + page.open(""); + ''' + + _TMP_FILE_NAMES = ['script', 'html'] + + def __init__(self, extractor, timeout=10000): + self.exe = check_executable('phantomjs', ['-v']) + if not self.exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + self.extractor = extractor + self.options = { + 'timeout': timeout, + } + self._TMP_FILES = {} + for name in self._TMP_FILE_NAMES: + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + self._TMP_FILES[name] = tmp + + def __del__(self): + for name in self._TMP_FILE_NAMES: + try: + os.remove(self._TMP_FILES[name].name) + except: + pass + + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): + """ + Downloads webpage (if needed) and executes JS + + Params: + url: website url + html: optional, html code of website + video_id: video id + note: optional, displayed when downloading webpage + note2: optional, displayed when executing JS + headers: custom http headers + jscode: code to be executed when page is loaded + + Returns tuple with: + * downloaded website (after JS execution) + * anything you print with `console.log` (but not inside `page.execute`!) + + In most cases you don't need to add any `jscode`. + It is executed in `page.onLoadFinished`. + `saveAndExit();` is mandatory, use it instead of `phantom.exit()` + It is possible to wait for some element on the webpage, for example: + var check = function() { + var elementFound = page.evaluate(function() { + return document.querySelector('#b.done') !== null; + }); + if(elementFound) + saveAndExit(); + else + window.setTimeout(check, 500); + } + + page.evaluate(function(){ + document.querySelector('#a').click(); + }); + check(); + """ + if 'saveAndExit();' not in jscode: + raise ExtractorError('`saveAndExit();` not found in `jscode`') + if not html: + html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) + with open(self._TMP_FILES['html'].name, 'wb') as f: + f.write(html.encode('utf-8')) + + replaces = self.options + replaces['url'] = url + user_agent = headers.get('User-Agent') or std_headers['User-Agent'] + replaces['ua'] = user_agent.replace('"', '\\"') + replaces['jscode'] = jscode + + for x in self._TMP_FILE_NAMES: + replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + + with open(self._TMP_FILES['script'].name, 'wb') as f: + f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) + + if video_id is None: + self.extractor.to_screen('%s' % (note2,)) + else: + self.extractor.to_screen('%s: %s' % (video_id, note2)) + + p = subprocess.Popen([self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Executing JS failed\n:' + + encodeArgument(err)) + with open(self._TMP_FILES['html'].name, 'rb') as f: + html = f.read().decode('utf-8') + return (html, encodeArgument(out)) + From 40e41780f1d770a355f01e3c1e6fb09ff392f97e Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 25 Apr 2017 15:12:54 +0200 Subject: [PATCH 003/238] [phantomjs] add cookie support --- youtube_dl/extractor/common.py | 8 +++-- youtube_dl/utils.py | 62 +++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dcc9d628a..e54adc9f0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2343,10 +2343,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, not port is None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 94e1b07a6..9c94b7ec9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3654,6 +3654,37 @@ def write_xattr(path, key, value): "or the 'xattr' binary.") +def cookie_to_dict(cookie): + cookie_dict = { + 'name': cookie.name, + 'value': cookie.value, + }; + if cookie.port_specified: + cookie_dict['port'] = cookie.port + if cookie.domain_specified: + cookie_dict['domain'] = cookie.domain + if cookie.path_specified: + cookie_dict['path'] = cookie.path + if not cookie.expires is None: + cookie_dict['expires'] = cookie.expires + if not cookie.secure is None: + cookie_dict['secure'] = cookie.secure + if not cookie.discard is None: + cookie_dict['discard'] = cookie.discard + try: + if (cookie.has_nonstandard_attr('httpOnly') or + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): + cookie_dict['httponly'] = True + except TypeError: + pass + return cookie_dict + + +def cookie_jar_to_list(cookie_jar): + return [cookie_to_dict(cookie) for cookie in cookie_jar] + + class PhantomJSwrapper(object): """PhantomJS wrapper class""" @@ -3674,6 +3705,9 @@ class PhantomJSwrapper(object): var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; var write = {{ mode: 'w', charset: 'utf-8' }}; + JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ + phantom.addCookie(x); + }}); page.settings.resourceTimeout = {timeout}; page.settings.userAgent = "{ua}"; page.onLoadStarted = function() {{ @@ -3684,6 +3718,7 @@ class PhantomJSwrapper(object): }}; var saveAndExit = function() {{ fs.write("{html}", page.content, write); + fs.write("{cookies}", JSON.stringify(phantom.cookies), write); phantom.exit(); }}; page.onLoadFinished = function(status) {{ @@ -3697,7 +3732,7 @@ class PhantomJSwrapper(object): page.open(""); ''' - _TMP_FILE_NAMES = ['script', 'html'] + _TMP_FILE_NAMES = ['script', 'html', 'cookies'] def __init__(self, extractor, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) @@ -3722,6 +3757,26 @@ class PhantomJSwrapper(object): except: pass + def _save_cookies(self, url): + cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) + for cookie in cookies: + if 'path' not in cookie: + cookie['path'] = '/' + if 'domain' not in cookie: + cookie['domain'] = compat_urlparse.urlparse(url).netloc + with open(self._TMP_FILES['cookies'].name, 'wb') as f: + f.write(json.dumps(cookies).encode('utf-8')) + + def _load_cookies(self): + with open(self._TMP_FILES['cookies'].name, 'rb') as f: + cookies = json.loads(f.read().decode('utf-8')) + for cookie in cookies: + if cookie['httponly'] is True: + cookie['rest'] = { 'httpOnly': None } + if 'expiry' in cookie: + cookie['expire_time'] = cookie['expiry'] + self.extractor._set_cookie(**cookie) + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ Downloads webpage (if needed) and executes JS @@ -3765,6 +3820,8 @@ class PhantomJSwrapper(object): with open(self._TMP_FILES['html'].name, 'wb') as f: f.write(html.encode('utf-8')) + self._save_cookies(url) + replaces = self.options replaces['url'] = url user_agent = headers.get('User-Agent') or std_headers['User-Agent'] @@ -3791,5 +3848,8 @@ class PhantomJSwrapper(object): + encodeArgument(err)) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') + + self._load_cookies() + return (html, encodeArgument(out)) From fcace2d1adac5d1f306b22219fde3a4542bcd719 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 10:30:45 +0200 Subject: [PATCH 004/238] [openload] raise `not found` before executing js --- youtube_dl/extractor/openload.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index ac5e0bb08..0adf17765 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -74,12 +74,14 @@ class OpenloadIE(InfoExtractor): 'User-Agent': self._USER_AGENT, } - phantom = PhantomJSwrapper(self) - webpage, _ = phantom.get(url, video_id=video_id, headers=headers) + webpage = self._download_webpage(url, video_id, headers=headers) if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) + phantom = PhantomJSwrapper(self) + webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) + decoded_id = get_element_by_id('streamurl', webpage) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From 98f9d873814da2a8584cc30c0e197c15ed249db3 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 12:41:42 +0200 Subject: [PATCH 005/238] [phantomjs] Add required version checking --- youtube_dl/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9c94b7ec9..84aaac664 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3734,13 +3734,22 @@ class PhantomJSwrapper(object): _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - def __init__(self, extractor, timeout=10000): + def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: raise ExtractorError('PhantomJS executable not found in PATH, ' 'download it from http://phantomjs.org', expected=True) + self.extractor = extractor + + if required_version: + version = get_exe_version(self.exe, version_re=r'([0-9.]+)') + if is_outdated_version(version, required_version): + self.extractor._downloader.report_warning( + 'Your copy of PhantomJS is outdated, update it to version ' + '%s or newer if you encounter any errors.' % required_version) + self.options = { 'timeout': timeout, } From 7552f96352f35cd877e52fd0770b77ba1856fc62 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 12:41:57 +0200 Subject: [PATCH 006/238] [openload] Add required version --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 0adf17765..292476ef8 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -79,7 +79,7 @@ class OpenloadIE(InfoExtractor): if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) - phantom = PhantomJSwrapper(self) + phantom = PhantomJSwrapper(self, required_version='2.0') webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) decoded_id = get_element_by_id('streamurl', webpage) From 5ff1bc0cc10bf3006834c4b49fc36d733c83ce5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 28 Apr 2017 22:25:20 +0100 Subject: [PATCH 007/238] [YoutubeDL] write raw subtitle files --- youtube_dl/YoutubeDL.py | 43 +++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c425..c7100bb91 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1696,29 +1696,30 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - if sub_info.get('data') is not None: - sub_data = sub_info['data'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - try: - sub_data = ie._download_webpage( - sub_info['url'], info_dict['id'], note=False) - except ExtractorError as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err.cause))) - continue - try: - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_data) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return + try: + sub_data = ie._request_webpage( + sub_info['url'], info_dict['id'], note=False).read() + with io.open(encodeFilename(sub_filename), 'wb') as subfile: + subfile.write(sub_data) + except (ExtractorError, IOError, OSError, ValueError) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) From 24e966e8dab954136dabbc497064ac63b252495b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 28 Jul 2017 12:13:19 +0200 Subject: [PATCH 008/238] [megaphone] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 8 +++++ youtube_dl/extractor/megaphone.py | 55 ++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 youtube_dl/extractor/megaphone.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2513f2587..668248648 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -558,6 +558,7 @@ from .matchtv import MatchTVIE from .mdr import MDRIE from .mediaset import MediasetIE from .medici import MediciIE +from .megaphone import MegaphoneIE from .meipai import MeipaiIE from .melonvod import MelonVODIE from .meta import METAIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 36c81eda9..9678c32c4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -97,6 +97,7 @@ from .washingtonpost import WashingtonPostIE from .wistia import WistiaIE from .mediaset import MediasetIE from .joj import JojIE +from .megaphone import MegaphoneIE class GenericIE(InfoExtractor): @@ -2790,6 +2791,13 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( joj_urls, video_id, video_title, ie=JojIE.ie_key()) + # Look for megaphone.fm embeds + mpfn_urls = MegaphoneIE._extract_urls(webpage) + if mpfn_urls: + return self.playlist_from_matches( + mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) + + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py new file mode 100644 index 000000000..60e3caf0d --- /dev/null +++ b/youtube_dl/extractor/megaphone.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class MegaphoneIE(InfoExtractor): + IE_NAME = 'megaphone.fm' + IE_DESC = 'megaphone.fm embedded players' + _VALID_URL = r'https://player\.megaphone\.fm/(?P[A-Z0-9]+)' + _TEST = { + 'url': 'https://player.megaphone.fm/GLT9749789991?"', + 'md5': '4816a0de523eb3e972dc0dda2c191f96', + 'info_dict': { + 'id': 'GLT9749789991', + 'ext': 'mp3', + 'title': '#97 What Kind Of Idiot Gets Phished?', + 'thumbnail': 're:^https://.*\.png.*$', + 'duration': 1776.26375, + 'author': 'Reply All', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_property('audio:title', webpage) + author = self._og_search_property('audio:artist', webpage) + thumbnail = self._og_search_thumbnail(webpage) + + episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON') + episode_data = self._parse_json(episode_json, video_id, js_to_json) + video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:') + + formats = [{ + 'url': video_url, + }] + + return { + 'id': video_id, + 'thumbnail': thumbnail, + 'title': title, + 'author': author, + 'duration': episode_data['duration'], + 'formats': formats, + } + + @classmethod + def _extract_urls(cls, webpage): + return [m[0] for m in re.findall( + r']*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)] From c5a49ff08413411174837f1034ef439b79ff774b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Jul 2017 15:02:41 +0700 Subject: [PATCH 009/238] [downloader/hls] Use redirect URL as manifest base (#13755) --- youtube_dl/downloader/hls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 0e29c8a2a..46308cf07 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -59,9 +59,9 @@ class HlsFD(FragmentFD): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) - manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read() - - s = manifest.decode('utf-8', 'ignore') + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + s = urlh.read().decode('utf-8', 'ignore') if not self.can_download(s, info_dict): if info_dict.get('extra_param_to_segment_url'): From cbbe66635f3c23316f04a6f56ad57e025bc47263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Jul 2017 15:10:19 +0700 Subject: [PATCH 010/238] [yandexdisk] Add extractor (closes #13755) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/yandexdisk.py | 115 +++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 youtube_dl/extractor/yandexdisk.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 668248648..852942e0d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1298,6 +1298,7 @@ from .yandexmusic import ( YandexMusicAlbumIE, YandexMusicPlaylistIE, ) +from .yandexdisk import YandexDiskIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .ynet import YnetIE diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py new file mode 100644 index 000000000..11729f0f7 --- /dev/null +++ b/youtube_dl/extractor/yandexdisk.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class YandexDiskIE(InfoExtractor): + _VALID_URL = r'https?://yadi\.sk/i/(?P[^/?#&]+)' + + _TEST = { + 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', + 'md5': '33955d7ae052f15853dc41f35f17581c', + 'info_dict': { + 'id': 'VdOeDou8eZs6Y', + 'ext': 'mp4', + 'title': '4.mp4', + 'duration': 168.6, + 'uploader': 'y.botova', + 'uploader_id': '300043621', + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + status = self._download_webpage( + 'https://disk.yandex.com/auth/status', video_id, query={ + 'urlOrigin': url, + 'source': 'public', + 'md5': 'false', + }) + + sk = self._search_regex( + r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', + status, 'sk', group='value') + + webpage = self._download_webpage(url, video_id) + + models = self._parse_json( + self._search_regex( + r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s* Date: Sat, 29 Jul 2017 15:13:12 +0700 Subject: [PATCH 011/238] [extractor/generic] PEP 8 --- youtube_dl/extractor/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9678c32c4..4b83e861b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2797,7 +2797,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): From 2a7a82321135bc59364c91caddde4211f378785b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Jul 2017 15:25:32 +0700 Subject: [PATCH 012/238] [svtplay] Update API URL (closes #13767) --- youtube_dl/extractor/svt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 1b5afb73e..38a505f9c 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -181,7 +181,7 @@ class SVTPlayIE(SVTBaseIE): if video_id: data = self._download_json( - 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id) + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, video_id) info_dict = self._extract_video(data, video_id) if not info_dict.get('title'): info_dict['title'] = re.sub( From c04017519da74a375d6c1c95733d921e96d8ee82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Jul 2017 15:30:53 +0700 Subject: [PATCH 013/238] [svtplay] Use geo verification proxy for API request --- youtube_dl/extractor/svt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 38a505f9c..48bc4529e 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -181,7 +181,8 @@ class SVTPlayIE(SVTBaseIE): if video_id: data = self._download_json( - 'https://api.svt.se/videoplayer-api/video/%s' % video_id, video_id) + 'https://api.svt.se/videoplayer-api/video/%s' % video_id, + video_id, headers=self.geo_verification_headers()) info_dict = self._extract_video(data, video_id) if not info_dict.get('title'): info_dict['title'] = re.sub( From 836ef2648613f4ca565b319af4769c02e35f60f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 29 Jul 2017 18:41:42 +0700 Subject: [PATCH 014/238] [soundcloud:trackstation] Add extractor (closes #13733) --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/soundcloud.py | 141 ++++++++++++++++++----------- 2 files changed, 89 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 852942e0d..d2c5e8030 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -935,8 +935,9 @@ from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE, + SoundcloudTrackStationIE, SoundcloudPlaylistIE, - SoundcloudSearchIE + SoundcloudSearchIE, ) from .soundgasm import ( SoundgasmIE, diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 3f1a46bb2..2f1b2978c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -31,6 +31,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ + (?!stations/track) (?P[\w\d-]+)/ (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P[\w\d-]+)/? @@ -330,7 +331,63 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): } -class SoundcloudUserIE(SoundcloudPlaylistBaseIE): +class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE): + _API_BASE = 'https://api.soundcloud.com' + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + def _extract_playlist(self, base_url, playlist_id, playlist_title): + COMMON_QUERY = { + 'limit': 50, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': '1', + } + + query = COMMON_QUERY.copy() + query['offset'] = 0 + + next_href = base_url + '?' + compat_urllib_parse_urlencode(query) + + entries = [] + for i in itertools.count(): + response = self._download_json( + next_href, playlist_id, 'Downloading track page %s' % (i + 1)) + + collection = response['collection'] + if not collection: + break + + def resolve_permalink_url(candidates): + for cand in candidates: + if isinstance(cand, dict): + permalink_url = cand.get('permalink_url') + entry_id = self._extract_id(cand) + if permalink_url and permalink_url.startswith('http'): + return permalink_url, entry_id + + for e in collection: + permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) + if permalink_url: + entries.append(self.url_result(permalink_url, video_id=entry_id)) + + next_href = response.get('next_href') + if not next_href: + break + + parsed_next_href = compat_urlparse.urlparse(response['next_href']) + qs = compat_urlparse.parse_qs(parsed_next_href.query) + qs.update(COMMON_QUERY) + next_href = compat_urlparse.urlunparse( + parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': playlist_title, + 'entries': entries, + } + + +class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): _VALID_URL = r'''(?x) https?:// (?:(?:www|m)\.)?soundcloud\.com/ @@ -385,16 +442,13 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE): 'playlist_mincount': 1, }] - _API_BASE = 'https://api.soundcloud.com' - _API_V2_BASE = 'https://api-v2.soundcloud.com' - _BASE_URL_MAP = { - 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE, - 'tracks': '%s/users/%%s/tracks' % _API_BASE, - 'sets': '%s/users/%%s/playlists' % _API_V2_BASE, - 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, - 'likes': '%s/users/%%s/likes' % _API_V2_BASE, - 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, + 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE, + 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, + 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE, } _TITLE_MAP = { @@ -416,57 +470,36 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE): resolv_url, uploader, 'Downloading user info') resource = mobj.group('rsrc') or 'all' - base_url = self._BASE_URL_MAP[resource] % user['id'] - COMMON_QUERY = { - 'limit': 50, - 'client_id': self._CLIENT_ID, - 'linked_partitioning': '1', - } + return self._extract_playlist( + self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']), + '%s (%s)' % (user['username'], self._TITLE_MAP[resource])) - query = COMMON_QUERY.copy() - query['offset'] = 0 - next_href = base_url + '?' + compat_urllib_parse_urlencode(query) +class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)' + IE_NAME = 'soundcloud:trackstation' + _TESTS = [{ + 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text', + 'info_dict': { + 'id': '286017854', + 'title': 'Track station: your-text', + }, + 'playlist_mincount': 47, + }] - entries = [] - for i in itertools.count(): - response = self._download_json( - next_href, uploader, 'Downloading track page %s' % (i + 1)) + def _real_extract(self, url): + track_name = self._match_id(url) - collection = response['collection'] - if not collection: - break + webpage = self._download_webpage(url, track_name) - def resolve_permalink_url(candidates): - for cand in candidates: - if isinstance(cand, dict): - permalink_url = cand.get('permalink_url') - entry_id = self._extract_id(cand) - if permalink_url and permalink_url.startswith('http'): - return permalink_url, entry_id + track_id = self._search_regex( + r'soundcloud:track-stations:(\d+)', webpage, 'track id') - for e in collection: - permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) - if permalink_url: - entries.append(self.url_result(permalink_url, video_id=entry_id)) - - next_href = response.get('next_href') - if not next_href: - break - - parsed_next_href = compat_urlparse.urlparse(response['next_href']) - qs = compat_urlparse.parse_qs(parsed_next_href.query) - qs.update(COMMON_QUERY) - next_href = compat_urlparse.urlunparse( - parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True))) - - return { - '_type': 'playlist', - 'id': compat_str(user['id']), - 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), - 'entries': entries, - } + return self._extract_playlist( + '%s/stations/soundcloud:track-stations:%s/tracks' + % (self._API_V2_BASE, track_id), + track_id, 'Track station: %s' % track_name) class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): From e445850e69990502b171765343fc38317e932aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 29 Jul 2017 18:45:57 +0700 Subject: [PATCH 015/238] [soundcloud] Update client id --- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2f1b2978c..2e52e092b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -122,7 +122,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z' + _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod From ca127ab2c174cdee4428eb4e192393c6ca942ac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 29 Jul 2017 23:07:28 +0700 Subject: [PATCH 016/238] [ard] Add support for lives (closes #13771) --- youtube_dl/extractor/ard.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2d5599456..3f248b147 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor): duration = int_or_none(media_info.get('_duration')) thumbnail = media_info.get('_previewImage') + is_live = media_info.get('_isLive') is True subtitles = {} subtitle_url = media_info.get('_subtitleUrl') @@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor): 'id': video_id, 'duration': duration, 'thumbnail': thumbnail, + 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, } @@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor): # determine video id from url m = re.match(self._VALID_URL, url) + document_id = None + numid = re.search(r'documentId=([0-9]+)', url) if numid: - video_id = numid.group(1) + document_id = video_id = numid.group(1) else: video_id = m.group('video_id') @@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor): 'formats': formats, } else: # request JSON file + if not document_id: + video_id = self._search_regex( + r'/play/(?:config|media)/(\d+)', webpage, 'media id') info = self._extract_media_info( - 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) + 'http://www.ardmediathek.de/play/media/%s' % video_id, + webpage, video_id) info.update({ 'id': video_id, - 'title': title, + 'title': self._live_title(title) if info.get('is_live') else title, 'description': description, 'thumbnail': thumbnail, }) From 198d4cb40ce9d819e8e4079058642ee96dae213b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Grzegorz=20Ruci=C5=84ski?= <grucin@gmail.com> Date: Sat, 29 Jul 2017 20:30:04 +0200 Subject: [PATCH 017/238] [generic] Add support for another ooyala embed pattern (closes #13727) --- youtube_dl/extractor/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4b83e861b..34e814988 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -575,6 +575,19 @@ class GenericIE(InfoExtractor): }, 'skip': 'movie expired', }, + # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js + { + 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', + 'info_dict': { + 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', + 'ext': 'mp4', + 'title': 'Steampunk Fest Comes to Honesdale', + 'duration': 43.276, + }, + 'params': { + 'skip_download': True, + } + }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -2293,6 +2306,7 @@ class GenericIE(InfoExtractor): # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or + re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: From a0a477b885dc1dd688058924357c4935f3c935cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 30 Jul 2017 15:48:22 +0700 Subject: [PATCH 018/238] [youjizz] Fix extraction (closes #13744) --- youtube_dl/extractor/youjizz.py | 80 ++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index b50f34e9b..f33fabe19 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,39 +1,95 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, +) class YouJizzIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])' + _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))' _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '78fc1901148284c69af12640e01c6310', + 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4', 'info_dict': { 'id': '2189178', 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, + 'duration': 2874, } }, { 'url': 'http://www.youjizz.com/videos/-2189178.html', 'only_matching': True, + }, { + 'url': 'https://www.youjizz.com/videos/embed/31991001', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - # YouJizz's HTML5 player has invalid HTML - webpage = webpage.replace('"controls', '" controls') - age_limit = self._rta_search(webpage) - video_title = self._html_search_regex( - r'<title>\s*(.*)\s*', webpage, 'title') + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('embed_id') - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r'(.+?)', webpage, 'title') + + formats = [] + + encodings = self._parse_json( + self._search_regex( + r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings', + default='[]'), + video_id, fatal=False) + for encoding in encodings: + if not isinstance(encoding, dict): + continue + format_url = encoding.get('filename') + if not isinstance(format_url, compat_str): + continue + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + format_id = encoding.get('name') or encoding.get('quality') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': height, + }) + + if formats: + info_dict = { + 'formats': formats, + } + else: + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') + info_dict = self._parse_html5_media_entries( + url, webpage, video_id)[0] + + duration = parse_duration(self._search_regex( + r'Runtime:([^<]+)', webpage, 'duration', + default=None)) + uploader = self._search_regex( + r'Uploaded By:.*?]*>([^<]+)', webpage, 'uploader', + default=None) info_dict.update({ 'id': video_id, - 'title': video_title, - 'age_limit': age_limit, + 'title': title, + 'age_limit': self._rta_search(webpage), + 'duration': duration, + 'uploader': uploader, }) return info_dict From 0ed4758023ddfb4d9630ba9114ef70ef7e6ac09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 Jul 2017 19:08:44 +0700 Subject: [PATCH 019/238] [clipfish] Remove extractor --- youtube_dl/extractor/clipfish.py | 67 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 68 deletions(-) delete mode 100644 youtube_dl/extractor/clipfish.py diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py deleted file mode 100644 index 0920f6219..000000000 --- a/youtube_dl/extractor/clipfish.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) - - -class ClipfishIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P[0-9]+)' - _TEST = { - 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', - 'md5': 'b9a5dc46294154c1193e2d10e0c95693', - 'info_dict': { - 'id': '4343170', - 'ext': 'mp4', - 'title': 'S01 E01 - Ugly Americans - Date in der Hölle', - 'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.', - 'upload_date': '20161005', - 'duration': 1291, - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_info = self._download_json( - 'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, - video_id)['items'][0] - - formats = [] - - m3u8_url = video_info.get('media_videourl_hls') - if m3u8_url: - formats.append({ - 'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), - 'ext': 'mp4', - 'format_id': 'hls', - }) - - mp4_url = video_info.get('media_videourl') - if mp4_url: - formats.append({ - 'url': mp4_url, - 'format_id': 'mp4', - 'width': int_or_none(video_info.get('width')), - 'height': int_or_none(video_info.get('height')), - 'tbr': int_or_none(video_info.get('bitrate')), - }) - - descr = video_info.get('descr') - if descr: - descr = descr.strip() - - return { - 'id': video_id, - 'title': video_info['title'], - 'description': descr, - 'formats': formats, - 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), - 'duration': int_or_none(video_info.get('media_length')), - 'upload_date': unified_strdate(video_info.get('pubDate')), - 'view_count': int_or_none(video_info.get('media_views')) - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d2c5e8030..bdc7370cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -186,7 +186,6 @@ from .chirbit import ( ) from .cinchcast import CinchcastIE from .cjsw import CJSWIE -from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE From 8b9f50d7cb4cfab5d505f4233c3e176a8106d6db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 Jul 2017 19:09:44 +0700 Subject: [PATCH 020/238] [watchbox] Add extractor (#13739) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/watchbox.py | 151 +++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 youtube_dl/extractor/watchbox.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bdc7370cd..3489e86f0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1244,6 +1244,7 @@ from .washingtonpost import ( WashingtonPostArticleIE, ) from .wat import WatIE +from .watchbox import WatchBoxIE from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py new file mode 100644 index 000000000..b382338fa --- /dev/null +++ b/youtube_dl/extractor/watchbox.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + js_to_json, + strip_or_none, + try_get, + unified_timestamp, +) + + +class WatchBoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?Pserien|filme)/(?:[^/]+/)*[^/]+-(?P\d+)' + _TESTS = [{ + # film + 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html', + 'info_dict': { + 'id': '341368', + 'ext': 'mp4', + 'title': 'Free Jimmy', + 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 4890, + 'age_limit': 16, + 'release_year': 2009, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + # episode + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html', + 'info_dict': { + 'id': '328286', + 'ext': 'mp4', + 'title': 'S01 E01 - Date in der Hölle', + 'description': 'md5:2f31c74a8186899f33cb5114491dae2b', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1291, + 'age_limit': 12, + 'release_year': 2010, + 'series': 'Ugly Americans', + 'season_number': 1, + 'episode': 'Date in der Hölle', + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id = mobj.group('kind', 'id') + + webpage = self._download_webpage(url, video_id) + + source = self._parse_json( + self._search_regex( + r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) or {} + + video_id = compat_str(source.get('videoId') or video_id) + + devapi = self._download_json( + 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={ + 'format': 'json', + 'apikey': 'hbbtv', + }, fatal=False) + + item = try_get(devapi, lambda x: x['items'][0], dict) or {} + + title = item.get('title') or try_get( + item, lambda x: x['movie']['headline_movie'], + compat_str) or source['title'] + + formats = [] + hls_url = item.get('media_videourl_hls') or source.get('hls') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + dash_url = item.get('media_videourl_wv') or source.get('dash') + if dash_url: + formats.extend(self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', fatal=False)) + mp4_url = item.get('media_videourl') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + 'tbr': int_or_none(item.get('bitrate')), + }) + self._sort_formats(formats) + + description = strip_or_none(item.get('descr')) + thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') + duration = int_or_none(item.get('media_length') or source.get('length')) + timestamp = unified_timestamp(item.get('pubDate')) + view_count = int_or_none(item.get('media_views')) + age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk'])) + release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year'])) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'age_limit': age_limit, + 'release_year': release_year, + 'formats': formats, + } + + if kind.lower() == 'serien': + series = try_get( + item, lambda x: x['special']['title'], + compat_str) or source.get('format') + season_number = int_or_none(self._search_regex( + r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number', + default=None) or self._search_regex( + r'/staffel-(\d+)/', url, 'season number', default=None)) + episode = source.get('title') + episode_number = int_or_none(self._search_regex( + r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number', + default=None)) + info.update({ + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info From f701827e319963ca783d012f3647aa44fc0efcd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 Jul 2017 19:43:09 +0700 Subject: [PATCH 021/238] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index 8e63b5c11..ca3ee8a93 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,28 @@ version Core +* [downloader/hls] Use redirect URL as manifest base (#13755) * [options] Correctly hide login info from debug outputs (#13696) +Extractors ++ [watchbox] Add support for watchbox.de (#13739) +- [clipfish] Remove extractor ++ [youjizz] Fix extraction (#13744) ++ [generic] Add support for another ooyala embed pattern (#13727) ++ [ard] Add support for lives (#13771) +* [soundcloud] Update client id ++ [soundcloud:trackstation] Add support for track stations (#13733) +* [svtplay] Use geo verification proxy for API request +* [svtplay] Update API URL (#13767) ++ [yandexdisk] Add support for yadi.sk (#13755) ++ [megaphone] Add support for megaphone.fm +* [amcnetworks] Make rating optional (#12453) +* [cloudy] Fix extraction (#13737) ++ [nickru] Add support for nickelodeon.ru +* [mtv] Improve thumbnal extraction +* [nick] Automate geo-restriction bypass (#13711) +* [niconico] Improve error reporting (#13696) + version 2017.07.23 From 5c9ea67bc0dedd15a0ed9ad05d8fcf09946ca461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 Jul 2017 20:47:31 +0700 Subject: [PATCH 022/238] release 2017.07.30.1 --- .github/ISSUE_TEMPLATE.md | 16 ++++++++-------- ChangeLog | 2 +- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 37d09d796..0421de755 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,16 +1,16 @@ ## Please follow the guide below - You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly -- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x]) -- Use *Preview* tab to see how your issue will actually look like +- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`) +- Use the *Preview* tab to see what your issue will actually look like --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.07.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.07.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.07.30.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.07.30.1** ### Before submitting an *issue* make sure you have: -- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections +- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones ### What is the purpose of your *issue*? @@ -28,14 +28,14 @@ ### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: -Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): +Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl -v `), copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): + ``` -$ youtube-dl -v [debug] System config: [] [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.07.23 +[debug] youtube-dl version 2017.07.30.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ca3ee8a93..4f03ef064 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.07.30.1 Core * [downloader/hls] Use redirect URL as manifest base (#13755) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index eb09c470c..77aac8249 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -155,7 +155,6 @@ - **chirbit:profile** - **Cinchcast** - **CJSW** - - **Clipfish** - **cliphunter** - **ClipRs** - **Clipsyndicate** @@ -440,6 +439,7 @@ - **Medialaan** - **Mediaset** - **Medici** + - **megaphone.fm**: megaphone.fm embedded players - **Meipai**: 美拍 - **MelonVOD** - **META** @@ -533,6 +533,7 @@ - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** + - **nickelodeonru** - **nicknight** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** @@ -734,6 +735,7 @@ - **soundcloud:playlist** - **soundcloud:search**: Soundcloud search - **soundcloud:set** + - **soundcloud:trackstation** - **soundcloud:user** - **soundgasm** - **soundgasm:profile** @@ -968,6 +970,7 @@ - **washingtonpost** - **washingtonpost:article** - **wat.tv** + - **WatchBox** - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** @@ -1003,6 +1006,7 @@ - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies + - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a8dbb93e3..38162157d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.07.23' +__version__ = '2017.07.30.1' From 9118c9f18a43a2f3e4814fcc02ac8e5180077df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Aug 2017 05:20:14 +0700 Subject: [PATCH 023/238] [nrktv] Update API host (closes #13796) --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3b4f51f61..18ead9426 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE): (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P\d+))? ''' % _EPISODE_RE - _API_HOST = 'psapi-we.nrk.no' + _API_HOST = 'psapi-ne.nrk.no' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', From 8cda78ef72c52c0424ddf90c22105dbc3b1d16f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 2 Aug 2017 23:12:34 +0700 Subject: [PATCH 024/238] [test_YoutubeDL] Add a test for #10083 --- test/test_YoutubeDL.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 4af14f9db..e70cbcd37 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -371,6 +371,19 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL({'format': 'best[height>360]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_format_selection_issue_10083(self): + # See https://github.com/rg3/youtube-dl/issues/10083 + formats = [ + {'format_id': 'regular', 'height': 360, 'url': TEST_URL}, + {'format_id': 'video', 'height': 720, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio', 'vcodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'best[height>360]/bestvideo[height>360]+bestaudio'}) + ydl.process_ie_result(info_dict.copy()) + self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'video+audio') + def test_invalid_format_specs(self): def assert_syntax_error(format_spec): ydl = YDL({'format': format_spec}) From feee8d32e45c9521426cf4a089c70f37542f0065 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 3 Aug 2017 14:17:25 +0200 Subject: [PATCH 025/238] [phantomjs] add exe version to debug info --- youtube_dl/YoutubeDL.py | 2 ++ youtube_dl/utils.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c425..033b50702 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -86,6 +86,7 @@ from .utils import ( write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, + PhantomJSwrapper, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -2146,6 +2147,7 @@ class YoutubeDL(object): exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c67f95ac9..4d0685d83 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3895,6 +3895,10 @@ class PhantomJSwrapper(object): _TMP_FILE_NAMES = ['script', 'html', 'cookies'] + @staticmethod + def _version(): + return get_exe_version('phantomjs', version_re=r'([0-9.]+)') + def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: @@ -3905,7 +3909,7 @@ class PhantomJSwrapper(object): self.extractor = extractor if required_version: - version = get_exe_version(self.exe, version_re=r'([0-9.]+)') + version = self._version() if is_outdated_version(version, required_version): self.extractor._downloader.report_warning( 'Your copy of PhantomJS is outdated, update it to version ' From 183062a4ab2f698f5096e69602fb2b5c861c01a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 3 Aug 2017 23:19:59 +0700 Subject: [PATCH 026/238] [pbs] Add support for new URL schema (closes #13801) --- youtube_dl/extractor/pbs.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 16cc667d0..8889e4a1a 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -189,7 +189,7 @@ class PBSIE(InfoExtractor): # Direct video URL (?:%s)/(?:viralplayer|video)/(?P[0-9]+)/? | # Article with embedded player (or direct video) - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P[^/]+?)(?:\.html)?/?(?:$|[?\#]) | + (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P[^/]+)/ ) @@ -345,6 +345,21 @@ class PBSIE(InfoExtractor): 'formats': 'mincount:8', }, }, + { + # https://github.com/rg3/youtube-dl/issues/13801 + 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/', + 'info_dict': { + 'id': '3003333873', + 'ext': 'mp4', + 'title': 'PBS NewsHour - full episode July 31, 2017', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 3265, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -433,6 +448,9 @@ class PBSIE(InfoExtractor): if url: break + if not url: + url = self._og_search_url(webpage) + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') From 1f03fef994e076a827cfd818eb4d76fe2eb85130 Mon Sep 17 00:00:00 2001 From: Justin Quan Date: Fri, 4 Aug 2017 08:43:44 -0700 Subject: [PATCH 027/238] [README.md] Improve grammar --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fe2bebc2a..0067184be 100644 --- a/README.md +++ b/README.md @@ -584,7 +584,7 @@ If you are using an output template inside a Windows batch file then you must es #### Output template examples -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of single. ```bash $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc @@ -671,7 +671,7 @@ If you want to preserve the old format selection behavior (prior to youtube-dl 2 #### Format selection examples -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of single. ```bash # Download best mp4 format available or any other best if no mp4 available From 11a6793f8013f37045d769e5b166f75e17f275d1 Mon Sep 17 00:00:00 2001 From: Matt Crupi Date: Fri, 4 Aug 2017 08:46:54 -0700 Subject: [PATCH 028/238] [mlb] Extend _VALID_URL (closes #13740) --- youtube_dl/extractor/mlb.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 59cd4b838..4d45f960e 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -15,7 +15,7 @@ class MLBIE(InfoExtractor): (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v| + (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -94,6 +94,10 @@ class MLBIE(InfoExtractor): 'upload_date': '20150415', } }, + { + 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', + 'only_matching': True, + }, { 'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb', 'only_matching': True, From 57a38a38c32ea2eb1ca54ee4ba3fcd31a9b7f328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Aug 2017 23:44:07 +0700 Subject: [PATCH 029/238] [udemy] Fix subtitles extraction (closes #13812) --- youtube_dl/extractor/udemy.py | 47 ++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 160be1b1b..3b02f43e3 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -15,6 +15,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + js_to_json, sanitized_Request, unescapeHTML, urlencode_postdata, @@ -268,6 +269,25 @@ class UdemyIE(InfoExtractor): f = add_output_format_meta(f, format_id) formats.append(f) + def extract_subtitles(track_list): + if not isinstance(track_list, list): + return + for track in track_list: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + src = track.get('src') + if not src or not isinstance(src, compat_str): + continue + lang = track.get('language') or track.get( + 'srclang') or track.get('label') + sub_dict = automatic_captions if track.get( + 'autogenerated') is True else subtitles + sub_dict.setdefault(lang, []).append({ + 'url': src, + }) + download_urls = asset.get('download_urls') if isinstance(download_urls, dict): extract_formats(download_urls.get('Video')) @@ -315,23 +335,16 @@ class UdemyIE(InfoExtractor): extract_formats(data.get('sources')) if not duration: duration = int_or_none(data.get('duration')) - tracks = data.get('tracks') - if isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - if track.get('kind') != 'captions': - continue - src = track.get('src') - if not src or not isinstance(src, compat_str): - continue - lang = track.get('language') or track.get( - 'srclang') or track.get('label') - sub_dict = automatic_captions if track.get( - 'autogenerated') is True else subtitles - sub_dict.setdefault(lang, []).append({ - 'url': src, - }) + extract_subtitles(data.get('tracks')) + + if not subtitles and not automatic_captions: + text_tracks = self._parse_json( + self._search_regex( + r'text-tracks=(["\'])(?P\[.+?\])\1', view_html, + 'text tracks', default='{}', group='data'), video_id, + transform_source=lambda s: js_to_json(unescapeHTML(s)), + fatal=False) + extract_subtitles(text_tracks) self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) From b3b5870cba46d84b7482f120f550822d3b64c3d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Aug 2017 23:51:03 +0700 Subject: [PATCH 030/238] [pornhd] Fix extraction (closes #13783) --- youtube_dl/extractor/pornhd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 842317e6c..36761788d 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor): r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') sources = self._parse_json(js_to_json(self._search_regex( - r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]", + r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]", webpage, 'sources', default='{}')), video_id) if not sources: From 799802f368012f579750b26db117b3a9dfdcbe05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 4 Aug 2017 23:54:28 +0700 Subject: [PATCH 031/238] [teamfour] Remove extractor (closes #13782) Now covered with generic extractor --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/teamfourstar.py | 48 ---------------------------- 2 files changed, 49 deletions(-) delete mode 100644 youtube_dl/extractor/teamfourstar.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3489e86f0..d0e04dd7d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -994,7 +994,6 @@ from .teachertube import ( ) from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE -from .teamfourstar import TeamFourStarIE from .techtalks import TechTalksIE from .ted import TEDIE from .tele13 import Tele13IE diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py deleted file mode 100644 index a8c6ed7be..000000000 --- a/youtube_dl/extractor/teamfourstar.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from .jwplatform import JWPlatformIE -from ..utils import unified_strdate - - -class TeamFourStarIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P[a-z0-9\-]+)' - _TEST = { - 'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/', - 'info_dict': { - 'id': '0WdZO31W', - 'title': 'TFS Abridged Parody Episode 1', - 'description': 'md5:d60bc389588ebab2ee7ad432bda953ae', - 'ext': 'mp4', - 'timestamp': 1394168400, - 'upload_date': '20080508', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - jwplatform_url = JWPlatformIE._extract_url(webpage) - - video_title = self._html_search_regex( - r']+class="entry-title"[^>]*>(?P.+?)</h1>', - webpage, 'title') - video_date = unified_strdate(self._html_search_regex( - r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>', - webpage, 'date', fatal=False)) - video_description = self._html_search_regex( - r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>', - webpage, 'description', fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': jwplatform_url, - } From f31fd0693b674e73f9273f0afba2a54853e4ca35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 00:00:21 +0700 Subject: [PATCH 032/238] [vidme] Extract DASH and HLS formats --- youtube_dl/extractor/vidme.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index e9ff336c4..a7971d72e 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import itertools from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( ExtractorError, int_or_none, @@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor): 'or for violating the terms of use.', expected=True) - formats = [{ - 'format_id': f.get('type'), - 'url': f['uri'], - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'preference': 0 if f.get('type', '').endswith('clip') else 1, - } for f in video.get('formats', []) if f.get('uri')] + formats = [] + for f in video.get('formats', []): + format_url = f.get('uri') + if not format_url or not isinstance(format_url, compat_str): + continue + format_type = f.get('type') + if format_type == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif format_type == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': f.get('type'), + 'url': format_url, + 'width': int_or_none(f.get('width')), + 'height': int_or_none(f.get('height')), + 'preference': 0 if f.get('type', '').endswith( + 'clip') else 1, + }) if not formats and video.get('complete_url'): formats.append({ From bbbe1cebfce3cfb63e9c01d29105fcba693ba54c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 00:09:36 +0700 Subject: [PATCH 033/238] [mlb] Update test (closes #13777) --- youtube_dl/extractor/mlb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 4d45f960e..675ff6873 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -84,7 +84,7 @@ class MLBIE(InfoExtractor): }, { 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'b190e70141fb9a1552a85426b4da1b5d', + 'md5': 'aafaf5b0186fee8f32f20508092f8111', 'info_dict': { 'id': '75609783', 'ext': 'mp4', From 8519b88f67de9c0c11cd2edd8dc55b9a4f13d110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 00:59:07 +0700 Subject: [PATCH 034/238] [yandexdisk] Relax _VALID_URL (closes #13824) --- youtube_dl/extractor/yandexdisk.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py index 11729f0f7..e8f6ae10f 100644 --- a/youtube_dl/extractor/yandexdisk.py +++ b/youtube_dl/extractor/yandexdisk.py @@ -13,9 +13,9 @@ from ..utils import ( class YandexDiskIE(InfoExtractor): - _VALID_URL = r'https?://yadi\.sk/i/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', 'md5': '33955d7ae052f15853dc41f35f17581c', 'info_dict': { @@ -27,7 +27,10 @@ class YandexDiskIE(InfoExtractor): 'uploader_id': '300043621', 'view_count': int, }, - } + }, { + 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 1141e9104bc0f8d577f18cf28a1af58adea1248e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 06:57:19 +0700 Subject: [PATCH 035/238] Use relative paths for DASH fragments (closes #12990) 10x reduced JSON size refs #13810 --- youtube_dl/downloader/dash.py | 14 ++++++++++---- youtube_dl/extractor/common.py | 16 ++++++++++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 7491fdad8..576ece6db 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error +from ..utils import urljoin class DashSegmentsFD(FragmentFD): @@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - segments = info_dict['fragments'][:1] if self.params.get( + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] ctx = { 'filename': filename, - 'total_frags': len(segments), + 'total_frags': len(fragments), } self._prepare_and_start_frag_download(ctx) @@ -26,7 +28,7 @@ class DashSegmentsFD(FragmentFD): skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) frag_index = 0 - for i, segment in enumerate(segments): + for i, fragment in enumerate(fragments): frag_index += 1 if frag_index <= ctx['fragment_index']: continue @@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) if not success: return False self._append_fragment(ctx, frag_content) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 748b4d59f..459e7ffd6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1892,9 +1892,13 @@ class InfoExtractor(object): 'Bandwidth': bandwidth, } + def location_key(location): + return 'url' if re.match(r'^https?://', location) else 'path' + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) + media_location_key = location_key(media_template) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time @@ -1904,7 +1908,7 @@ class InfoExtractor(object): segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ - 'url': media_template % { + media_location_key: media_template % { 'Number': segment_number, 'Bandwidth': bandwidth, }, @@ -1928,7 +1932,7 @@ class InfoExtractor(object): 'Number': segment_number, } representation_ms_info['fragments'].append({ - 'url': segment_url, + media_location_key: segment_url, 'duration': float_or_none(segment_d, representation_ms_info['timescale']), }) @@ -1952,8 +1956,9 @@ class InfoExtractor(object): for s in representation_ms_info['s']: duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): + segment_uri = representation_ms_info['segment_urls'][segment_index] fragments.append({ - 'url': representation_ms_info['segment_urls'][segment_index], + location_key(segment_uri): segment_uri, 'duration': duration, }) segment_index += 1 @@ -1962,6 +1967,7 @@ class InfoExtractor(object): # No fragments key is present in this case. if 'fragments' in representation_ms_info: f.update({ + 'fragment_base_url': base_url, 'fragments': [], 'protocol': 'http_dash_segments', }) @@ -1969,10 +1975,8 @@ class InfoExtractor(object): initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url - f['fragments'].append({'url': initialization_url}) + f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - for fragment in f['fragments']: - fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats From c983cc3b71e3b2c80df920481dfa90bbc2ad7937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 08:17:01 +0700 Subject: [PATCH 036/238] [cinchcast] Extend _VALID_URL --- youtube_dl/extractor/cinchcast.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py index 562c9bbbb..b861d54b0 100644 --- a/youtube_dl/extractor/cinchcast.py +++ b/youtube_dl/extractor/cinchcast.py @@ -9,12 +9,20 @@ from ..utils import ( class CinchcastIE(InfoExtractor): - _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', + 'info_dict': { + 'id': '5258197', + 'ext': 'mp3', + 'title': 'Train Your Brain to Up Your Game with Coach Mandy', + 'upload_date': '20130816', + }, + }, { # Actual test is run in generic, look for undergroundwellness 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', 'only_matching': True, - } + }] def _real_extract(self, url): video_id = self._match_id(url) From 1d5472290f8c426c13e3403cb95fd44cc71b5a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 08:28:12 +0700 Subject: [PATCH 037/238] [podomatic] Extend _VALID_URL (closes #13827) --- youtube_dl/extractor/podomatic.py | 63 +++++++++++++++++-------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py index f20946a2b..25fcebf9f 100644 --- a/youtube_dl/extractor/podomatic.py +++ b/youtube_dl/extractor/podomatic.py @@ -9,39 +9,46 @@ from ..utils import int_or_none class PodomaticIE(InfoExtractor): IE_NAME = 'podomatic' - _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)' + _VALID_URL = r'''(?x) + (?P<proto>https?):// + (?: + (?P<channel>[^.]+)\.podomatic\.com/entry| + (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes + )/ + (?P<id>[^/?#&]+) + ''' - _TESTS = [ - { - 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', - 'md5': '84bb855fcf3429e6bf72460e1eed782d', - 'info_dict': { - 'id': '2009-01-02T16_03_35-08_00', - 'ext': 'mp3', - 'uploader': 'Science Teaching Tips', - 'uploader_id': 'scienceteachingtips', - 'title': '64. When the Moon Hits Your Eye', - 'duration': 446, - } - }, - { - 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', - 'md5': 'd2cf443931b6148e27638650e2638297', - 'info_dict': { - 'id': '2013-11-15T16_31_21-08_00', - 'ext': 'mp3', - 'uploader': 'Ostbahnhof / Techno Mix', - 'uploader_id': 'ostbahnhof', - 'title': 'Einunddreizig', - 'duration': 3799, - } - }, - ] + _TESTS = [{ + 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00', + 'md5': '84bb855fcf3429e6bf72460e1eed782d', + 'info_dict': { + 'id': '2009-01-02T16_03_35-08_00', + 'ext': 'mp3', + 'uploader': 'Science Teaching Tips', + 'uploader_id': 'scienceteachingtips', + 'title': '64. When the Moon Hits Your Eye', + 'duration': 446, + } + }, { + 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00', + 'md5': 'd2cf443931b6148e27638650e2638297', + 'info_dict': { + 'id': '2013-11-15T16_31_21-08_00', + 'ext': 'mp3', + 'uploader': 'Ostbahnhof / Techno Mix', + 'uploader_id': 'ostbahnhof', + 'title': 'Einunddreizig', + 'duration': 3799, + } + }, { + 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - channel = mobj.group('channel') + channel = mobj.group('channel') or mobj.group('channel_2') json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' + '?permalink=true&rtmp=0') % From f172c86dcdb46e484afc63732db56df5633028ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 5 Aug 2017 21:17:55 +0700 Subject: [PATCH 038/238] [vlive:channel] Limit number of videos per page to 100 (closes #13830) --- youtube_dl/extractor/vlive.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 77c120a57..64d0224e6 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -236,7 +236,12 @@ class VLiveChannelIE(InfoExtractor): query={ 'app_id': app_id, 'channelSeq': channel_seq, - 'maxNumOfRows': 1000, + # Large values of maxNumOfRows (~300 or above) may cause + # empty responses (see [1]), e.g. this happens for [2] that + # has more than 300 videos. + # 1. https://github.com/rg3/youtube-dl/issues/13830 + # 2. http://channels.vlive.tv/EDBF. + 'maxNumOfRows': 100, '_': int(time.time()), 'pageNo': page_num } From daaaf5f5942252e9fbc367957bd3b8a96d0dd5bb Mon Sep 17 00:00:00 2001 From: Ashutosh Chaudhary <ashutosh.chaudhary@gmail.com> Date: Mon, 23 Jan 2017 05:12:52 +0530 Subject: [PATCH 039/238] [voot] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/voot.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/voot.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d0e04dd7d..48dda8b8e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1333,3 +1333,4 @@ from .zapiks import ZapiksIE from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .voot import VootIE diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py new file mode 100644 index 000000000..db5bda660 --- /dev/null +++ b/youtube_dl/extractor/voot.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class VootIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/(?:.+?[/-]?)/1/(?:.+?[0-9]?)/(?:.+?[/-]?)/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', + 'info_dict': { + 'id': '441353', + 'ext': 'mp4', + 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + _GET_CONTENT_TEMPLATE = 'https://wapi.voot.com/ws/ott/getMediaInfo.json?platform=Web&pId=3&mediaId=%s' + + def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): + json_data = super(VootIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) + if json_data['status']['code'] != 0: + if fatal: + raise ExtractorError(json_data['status']['message']) + return None + return json_data['assets'] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + self._GET_CONTENT_TEMPLATE % video_id, + video_id) + + thumbnail = '' + formats = [] + + if video_data: + format_url = video_data.get('URL') + formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + + if video_data['Pictures']: + for picture in video_data['Pictures']: + #Get only first available thumbnail + thumbnail = picture.get('URL') + break + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data.get('MediaName'), + 'thumbnail': thumbnail, + 'formats':formats, + } From e2b4808fd8ed49424deaa6d800daf0950e55ffff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 6 Aug 2017 08:04:51 +0700 Subject: [PATCH 040/238] [voot] Improve extraction (#10255, closes #11814) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/voot.py | 111 ++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 48dda8b8e..ebe414dae 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1222,6 +1222,7 @@ from .vodlocker import VodlockerIE from .vodpl import VODPlIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE +from .voot import VootIE from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE @@ -1333,4 +1334,3 @@ from .zapiks import ZapiksIE from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE -from .voot import VootIE diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py index db5bda660..5de3deb8c 100644 --- a/youtube_dl/extractor/voot.py +++ b/youtube_dl/extractor/voot.py @@ -2,54 +2,97 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + unified_timestamp, +) class VootIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?voot\.com/shows/(?:.+?[/-]?)/1/(?:.+?[0-9]?)/(?:.+?[/-]?)/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)' + _GEO_COUNTRIES = ['IN'] + _TESTS = [{ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353', 'info_dict': { - 'id': '441353', + 'id': '0_8ledb18o', 'ext': 'mp4', 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - _GET_CONTENT_TEMPLATE = 'https://wapi.voot.com/ws/ott/getMediaInfo.json?platform=Web&pId=3&mediaId=%s' - - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True): - json_data = super(VootIE, self)._download_json(url_or_request, video_id, note, fatal=fatal) - if json_data['status']['code'] != 0: - if fatal: - raise ExtractorError(json_data['status']['message']) - return None - return json_data['assets'] + 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1', + 'uploader_id': 'batchUser', + 'timestamp': 1472162937, + 'upload_date': '20160825', + 'duration': 1146, + 'series': 'Ishq Ka Rang Safed', + 'season_number': 1, + 'episode': 'Is this the end of Kamini?', + 'episode_number': 340, + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925', + 'only_matching': True, + }, { + 'url': 'https://www.voot.com/movies/pandavas-5/424627', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - self._GET_CONTENT_TEMPLATE % video_id, - video_id) - thumbnail = '' - formats = [] + media_info = self._download_json( + 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id, + query={ + 'platform': 'Web', + 'pId': 2, + 'mediaId': video_id, + }) - if video_data: - format_url = video_data.get('URL') - formats.extend(self._extract_m3u8_formats(format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + status_code = try_get(media_info, lambda x: x['status']['code'], int) + if status_code != 0: + raise ExtractorError(media_info['status']['message'], expected=True) - if video_data['Pictures']: - for picture in video_data['Pictures']: - #Get only first available thumbnail - thumbnail = picture.get('URL') - break + media = media_info['assets'] - self._sort_formats(formats) + entry_id = media['EntryId'] + title = media['MediaName'] + + description, series, season_number, episode, episode_number = [None] * 5 + + for meta in try_get(media, lambda x: x['Metas'], list) or []: + key, value = meta.get('Key'), meta.get('Value') + if not key or not value: + continue + if key == 'ContentSynopsis': + description = value + elif key == 'RefSeriesTitle': + series = value + elif key == 'RefSeriesSeason': + season_number = int_or_none(value) + elif key == 'EpisodeMainTitle': + episode = value + elif key == 'EpisodeNo': + episode_number = int_or_none(value) return { - 'id': video_id, - 'title': video_data.get('MediaName'), - 'thumbnail': thumbnail, - 'formats':formats, + '_type': 'url_transparent', + 'url': 'kaltura:1982551:%s' % entry_id, + 'ie_key': KalturaIE.ie_key(), + 'title': title, + 'description': description, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'timestamp': unified_timestamp(media.get('CreationDate')), + 'duration': int_or_none(media.get('Duration')), + 'view_count': int_or_none(media.get('ViewCounter')), + 'like_count': int_or_none(media.get('like_counter')), } From 16afce174ea71690844d37776d518ae374b896ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 6 Aug 2017 08:18:16 +0700 Subject: [PATCH 041/238] [mpora] Remove extractor (closes #13826) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/mpora.py | 62 ------------------------------ 2 files changed, 63 deletions(-) delete mode 100644 youtube_dl/extractor/mpora.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ebe414dae..897557f93 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -584,7 +584,6 @@ from .mixcloud import ( ) from .mlb import MLBIE from .mnet import MnetIE -from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py deleted file mode 100644 index 5a1bee5c8..000000000 --- a/youtube_dl/extractor/mpora.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import int_or_none - - -class MporaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)' - IE_NAME = 'MPORA' - - _TEST = { - 'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de', - 'md5': 'a7a228473eedd3be741397cf452932eb', - 'info_dict': { - 'id': 'AAdo8okx4wiz', - 'ext': 'mp4', - 'title': 'Katy Curd - Winter in the Forest', - 'duration': 416, - 'uploader': 'Peter Newman Media', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - data_json = self._search_regex( - [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", - r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"], - webpage, 'json') - data = self._parse_json(data_json, video_id) - - uploader = data['info_overlay'].get('username') - duration = data['video']['duration'] // 1000 - thumbnail = data['video']['encodings']['sd']['poster'] - title = data['info_overlay']['title'] - - formats = [] - for encoding_id, edata in data['video']['encodings'].items(): - for src in edata['sources']: - width_str = self._search_regex( - r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'], - False, default=None) - vcodec = src['type'].partition('/')[2] - - formats.append({ - 'format_id': encoding_id + '-' + vcodec, - 'url': src['src'], - 'vcodec': vcodec, - 'width': int_or_none(width_str), - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'uploader': uploader, - 'duration': duration, - 'thumbnail': thumbnail, - } From fac188c6954edcccf3104abc3ac0155125a7d427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 6 Aug 2017 08:44:28 +0700 Subject: [PATCH 042/238] [pluralsight] Fix format selection --- youtube_dl/extractor/pluralsight.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index e45d9fe55..d35f54ce8 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -224,6 +224,7 @@ class PluralsightIE(PluralsightBaseIE): req_format_split = req_format.split('-', 1) if len(req_format_split) > 1: req_ext, req_quality = req_format_split + req_quality = '-'.join(req_quality.split('-')[:2]) for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality, )), ) From 92740e42414cb47f785daf257b9726fa361977b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 6 Aug 2017 09:02:14 +0700 Subject: [PATCH 043/238] [ChangeLog] Actualize --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4f03ef064..e2515866e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version <unreleased> + +Core +* Use relative paths for DASH fragments (#12990) + +Extractors +* [pluralsight] Fix format selection +- [mpora] Remove extractor (#13826) ++ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218) +* [vlive:channel] Limit number of videos per page to 100 (#13830) +* [podomatic] Extend URL regular expression (#13827) +* [cinchcast] Extend URL regular expression +* [yandexdisk] Relax URL regular expression (#13824) +* [vidme] Extract DASH and HLS formats +- [teamfour] Remove extractor (#13782) +* [pornhd] Fix extraction (#13783) +* [udemy] Fix subtitles extraction (#13812) +* [mlb] Extend URL regular expression (#13740, #13773) ++ [pbs] Add support for new URL schema (#13801) +* [nrktv] Update API host (#13796) + + version 2017.07.30.1 Core From 903a183b6adc60808f04294a7003b6d4bd250304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 6 Aug 2017 09:05:36 +0700 Subject: [PATCH 044/238] release 2017.08.06 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +-- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 0421de755..5b72032bc 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.07.30.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.07.30.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.06*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.06** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.07.30.1 +[debug] youtube-dl version 2017.08.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e2515866e..18893bba8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.08.06 Core * Use relative paths for DASH fragments (#12990) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 77aac8249..a3bd07726 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -472,7 +472,6 @@ - **MovieFap** - **Moviezine** - **MovingImage** - - **MPORA** - **MSN** - **mtg**: MTG services - **mtv** @@ -783,7 +782,6 @@ - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** - **Teamcoco** - - **TeamFourStar** - **TechTalks** - **techtv.mit.edu** - **ted** @@ -953,6 +951,7 @@ - **VODPl** - **VODPlatform** - **VoiceRepublic** + - **Voot** - **VoxMedia** - **Vporn** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 38162157d..11d3bf29f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.07.30.1' +__version__ = '2017.08.06' From 463e7216c87814edf1453aa3a5bfad89474ba6b1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 6 Aug 2017 23:07:06 +0800 Subject: [PATCH 045/238] [niconico] Support HTML5-only videos (closes #13806) --- ChangeLog | 6 ++ youtube_dl/extractor/niconico.py | 124 +++++++++++++++++++++---------- 2 files changed, 90 insertions(+), 40 deletions(-) diff --git a/ChangeLog b/ChangeLog index 18893bba8..7cd385e68 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors ++ [niconico] Support HTML5-only videos (#13806) + + version 2017.08.06 Core diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 79b9952c3..b13dc0035 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -11,10 +11,13 @@ from ..compat import ( ) from ..utils import ( determine_ext, + dict_get, ExtractorError, int_or_none, parse_duration, parse_iso8601, + try_get, + unified_timestamp, urlencode_postdata, xpath_text, ) @@ -31,12 +34,15 @@ class NiconicoIE(InfoExtractor): 'id': 'sm22312215', 'ext': 'mp4', 'title': 'Big Buck Bunny', + 'thumbnail': r're:https?://.*', 'uploader': 'takuya0301', 'uploader_id': '2698420', 'upload_date': '20131123', 'timestamp': 1385182762, 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, + 'view_count': int, + 'comment_count': int, }, 'skip': 'Requires an account', }, { @@ -48,6 +54,7 @@ class NiconicoIE(InfoExtractor): 'ext': 'swf', 'title': '【鏡音リン】Dance on media【オリジナル】take2!', 'description': 'md5:689f066d74610b3b22e0f1739add0f58', + 'thumbnail': r're:https?://.*', 'uploader': 'りょうた', 'uploader_id': '18822557', 'upload_date': '20110429', @@ -64,9 +71,11 @@ class NiconicoIE(InfoExtractor): 'ext': 'unknown_video', 'description': 'deleted', 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', + 'thumbnail': r're:https?://.*', 'upload_date': '20071224', 'timestamp': int, # timestamp field has different value if logged in 'duration': 304, + 'view_count': int, }, 'skip': 'Requires an account', }, { @@ -76,12 +85,31 @@ class NiconicoIE(InfoExtractor): 'ext': 'mp4', 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~', 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1', + 'thumbnail': r're:https?://.*', 'timestamp': 1388851200, 'upload_date': '20140104', 'uploader': 'アニメロチャンネル', 'uploader_id': '312', }, 'skip': 'The viewing period of the video you were searching for has expired.', + }, { + # video not available via `getflv` + 'url': 'http://www.nicovideo.jp/watch/sm1151009', + 'info_dict': { + 'id': 'sm1151009', + 'ext': 'flv', + 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', + 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', + 'thumbnail': r're:https?://.*', + 'duration': 184, + 'timestamp': 1190868283, + 'upload_date': '20070927', + 'uploader': 'denden2', + 'uploader_id': '1392194', + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -130,33 +158,51 @@ class NiconicoIE(InfoExtractor): if video_id.startswith('so'): video_id = self._match_id(handle.geturl()) - video_info = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, - note='Downloading video info page') + api_data = self._parse_json(self._html_search_regex( + 'data-api-data="([^"]+)"', webpage, + 'API data', default='{}'), video_id) + video_real_url = try_get( + api_data, lambda x: x['video']['smileInfo']['url']) - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') + if video_real_url: + def get_video_info(items): + return dict_get(api_data['video'], items) + else: + # Get flv info + flv_info_webpage = self._download_webpage( + 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', + video_id, 'Downloading flv info') - flv_info = compat_urlparse.parse_qs(flv_info_webpage) - if 'url' not in flv_info: - if 'deleted' in flv_info: - raise ExtractorError('The video has been deleted.', - expected=True) - elif 'closed' in flv_info: - raise ExtractorError('Niconico videos now require logging in', - expected=True) - elif 'error' in flv_info: - raise ExtractorError('%s reports error: %s' % ( - self.IE_NAME, flv_info['error'][0]), expected=True) - else: - raise ExtractorError('Unable to find video URL') + flv_info = compat_urlparse.parse_qs(flv_info_webpage) + if 'url' not in flv_info: + if 'deleted' in flv_info: + raise ExtractorError('The video has been deleted.', + expected=True) + elif 'closed' in flv_info: + raise ExtractorError('Niconico videos now require logging in', + expected=True) + elif 'error' in flv_info: + raise ExtractorError('%s reports error: %s' % ( + self.IE_NAME, flv_info['error'][0]), expected=True) + else: + raise ExtractorError('Unable to find video URL') - video_real_url = flv_info['url'][0] + video_real_url = flv_info['url'][0] + + video_info_xml = self._download_xml( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, + video_id, note='Downloading video info page') + + def get_video_info(items): + if not isinstance(items, list): + items = [items] + for item in items: + ret = xpath_text(video_info_xml, './/' + item) + if ret: + return ret # Start extracting information - title = xpath_text(video_info, './/title') + title = get_video_info('title') if not title: title = self._og_search_title(webpage, default=None) if not title: @@ -170,18 +216,19 @@ class NiconicoIE(InfoExtractor): watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} video_detail = watch_api_data.get('videoDetail', {}) - extension = xpath_text(video_info, './/movie_type') + extension = get_video_info(['movie_type', 'movieType']) if not extension: extension = determine_ext(video_real_url) thumbnail = ( - xpath_text(video_info, './/thumbnail_url') or + get_video_info(['thumbnail_url', 'thumbnailURL']) or self._html_search_meta('image', webpage, 'thumbnail', default=None) or video_detail.get('thumbnail')) - description = xpath_text(video_info, './/description') + description = get_video_info('description') - timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve')) + timestamp = (parse_iso8601(get_video_info('first_retrieve')) or + unified_timestamp(get_video_info('postedDateTime'))) if not timestamp: match = self._html_search_meta('datePublished', webpage, 'date published', default=None) if match: @@ -191,7 +238,7 @@ class NiconicoIE(InfoExtractor): video_detail['postedAt'].replace('/', '-'), delimiter=' ', timezone=datetime.timedelta(hours=9)) - view_count = int_or_none(xpath_text(video_info, './/view_counter')) + view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) if not view_count: match = self._html_search_regex( r'>Views: <strong[^>]*>([^<]+)</strong>', @@ -200,31 +247,28 @@ class NiconicoIE(InfoExtractor): view_count = int_or_none(match.replace(',', '')) view_count = view_count or video_detail.get('viewCount') - comment_count = int_or_none(xpath_text(video_info, './/comment_num')) + comment_count = (int_or_none(get_video_info('comment_num')) or + video_detail.get('commentCount') or + try_get(api_data, lambda x: x['thread']['commentCount'])) if not comment_count: match = self._html_search_regex( r'>Comments: <strong[^>]*>([^<]+)</strong>', webpage, 'comment count', default=None) if match: comment_count = int_or_none(match.replace(',', '')) - comment_count = comment_count or video_detail.get('commentCount') duration = (parse_duration( - xpath_text(video_info, './/length') or + get_video_info('length') or self._html_search_meta( 'video:duration', webpage, 'video duration', default=None)) or - video_detail.get('length')) + video_detail.get('length') or + get_video_info('duration')) - webpage_url = xpath_text(video_info, './/watch_url') or url + webpage_url = get_video_info('watch_url') or url - if video_info.find('.//ch_id') is not None: - uploader_id = video_info.find('.//ch_id').text - uploader = video_info.find('.//ch_name').text - elif video_info.find('.//user_id') is not None: - uploader_id = video_info.find('.//user_id').text - uploader = video_info.find('.//user_nickname').text - else: - uploader_id = uploader = None + owner = api_data.get('owner', {}) + uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') + uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') return { 'id': video_id, From ee6a611665a1ee8583ce84bd9d36d03b6f697895 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 7 Aug 2017 00:19:46 +0800 Subject: [PATCH 046/238] [niconico] Support videos with multiple formats (closes #13522) --- ChangeLog | 1 + youtube_dl/extractor/niconico.py | 161 +++++++++++++++++++++++++++---- 2 files changed, 145 insertions(+), 17 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7cd385e68..084e98c0e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors ++ [niconico] Support videos with multiple formats (#13522) + [niconico] Support HTML5-only videos (#13806) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index b13dc0035..026329d3e 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -14,8 +14,10 @@ from ..utils import ( dict_get, ExtractorError, int_or_none, + float_or_none, parse_duration, parse_iso8601, + remove_start, try_get, unified_timestamp, urlencode_postdata, @@ -93,11 +95,12 @@ class NiconicoIE(InfoExtractor): }, 'skip': 'The viewing period of the video you were searching for has expired.', }, { - # video not available via `getflv` + # video not available via `getflv`; "old" HTML5 video 'url': 'http://www.nicovideo.jp/watch/sm1151009', + 'md5': '8fa81c364eb619d4085354eab075598a', 'info_dict': { 'id': 'sm1151009', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)', 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7', 'thumbnail': r're:https?://.*', @@ -110,6 +113,25 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, }, 'skip': 'Requires an account', + }, { + # "New" HTML5 video + 'url': 'http://www.nicovideo.jp/watch/sm31464864', + 'md5': '351647b4917660986dc0fa8864085135', + 'info_dict': { + 'id': 'sm31464864', + 'ext': 'mp4', + 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質', + 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', + 'timestamp': 1498514060, + 'upload_date': '20170626', + 'uploader': 'ゲス', + 'uploader_id': '40826363', + 'thumbnail': r're:https?://.*', + 'duration': 198, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -147,6 +169,84 @@ class NiconicoIE(InfoExtractor): self._downloader.report_warning('unable to log in: bad username or password') return login_ok + def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): + def yesno(boolean): + return 'yes' if boolean else 'no' + + session_api_data = api_data['video']['dmcInfo']['session_api'] + session_api_endpoint = session_api_data['urls'][0] + + format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) + + session_response = self._download_json( + session_api_endpoint['url'], video_id, + query={'_format': 'json'}, + headers={'Content-Type': 'application/json'}, + note='Downloading JSON metadata for %s' % format_id, + data=json.dumps({ + 'session': { + 'client_info': { + 'player_id': session_api_data['player_id'], + }, + 'content_auth': { + 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], + 'content_key_timeout': session_api_data['content_key_timeout'], + 'service_id': 'nicovideo', + 'service_user_id': session_api_data['service_user_id'] + }, + 'content_id': session_api_data['content_id'], + 'content_src_id_sets': [{ + 'content_src_ids': [{ + 'src_id_to_mux': { + 'audio_src_ids': [audio_quality['id']], + 'video_src_ids': [video_quality['id']], + } + }] + }], + 'content_type': 'movie', + 'content_uri': '', + 'keep_method': { + 'heartbeat': { + 'lifetime': session_api_data['heartbeat_lifetime'] + } + }, + 'priority': session_api_data['priority'], + 'protocol': { + 'name': 'http', + 'parameters': { + 'http_parameters': { + 'parameters': { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_endpoint['is_ssl']), + 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']), + } + } + } + } + }, + 'recipe_id': session_api_data['recipe_id'], + 'session_operation_auth': { + 'session_operation_auth_by_signature': { + 'signature': session_api_data['signature'], + 'token': session_api_data['token'], + } + }, + 'timing_constraint': 'unlimited' + } + })) + + resolution = video_quality.get('resolution', {}) + + return { + 'url': session_response['data']['session']['content_uri'], + 'format_id': format_id, + 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 + 'abr': float_or_none(audio_quality.get('bitrate'), 1000), + 'vbr': float_or_none(video_quality.get('bitrate'), 1000), + 'height': resolution.get('height'), + 'width': resolution.get('width'), + } + def _real_extract(self, url): video_id = self._match_id(url) @@ -161,13 +261,13 @@ class NiconicoIE(InfoExtractor): api_data = self._parse_json(self._html_search_regex( 'data-api-data="([^"]+)"', webpage, 'API data', default='{}'), video_id) - video_real_url = try_get( - api_data, lambda x: x['video']['smileInfo']['url']) - if video_real_url: - def get_video_info(items): - return dict_get(api_data['video'], items) - else: + def _format_id_from_url(video_url): + return 'economy' if video_real_url.endswith('low') else 'normal' + + try: + video_real_url = api_data['video']['smileInfo']['url'] + except KeyError: # Flash videos # Get flv info flv_info_webpage = self._download_webpage( 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', @@ -187,8 +287,6 @@ class NiconicoIE(InfoExtractor): else: raise ExtractorError('Unable to find video URL') - video_real_url = flv_info['url'][0] - video_info_xml = self._download_xml( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note='Downloading video info page') @@ -201,6 +299,41 @@ class NiconicoIE(InfoExtractor): if ret: return ret + video_real_url = flv_info['url'][0] + + extension = get_video_info('movie_type') + if not extension: + extension = determine_ext(video_real_url) + + formats = [{ + 'url': video_real_url, + 'ext': extension, + 'format_id': _format_id_from_url(video_real_url), + }] + else: + formats = [] + + dmc_info = api_data['video'].get('dmcInfo') + if dmc_info: # "New" HTML5 videos + quality_info = dmc_info['quality'] + for audio_quality in quality_info['audios']: + for video_quality in quality_info['videos']: + if not audio_quality['available'] or not video_quality['available']: + continue + formats.append(self._extract_format_for_quality( + api_data, video_id, audio_quality, video_quality)) + + self._sort_formats(formats) + else: # "Old" HTML5 videos + formats = [{ + 'url': video_real_url, + 'ext': 'mp4', + 'format_id': _format_id_from_url(video_real_url), + }] + + def get_video_info(items): + return dict_get(api_data['video'], items) + # Start extracting information title = get_video_info('title') if not title: @@ -216,10 +349,6 @@ class NiconicoIE(InfoExtractor): watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} video_detail = watch_api_data.get('videoDetail', {}) - extension = get_video_info(['movie_type', 'movieType']) - if not extension: - extension = determine_ext(video_real_url) - thumbnail = ( get_video_info(['thumbnail_url', 'thumbnailURL']) or self._html_search_meta('image', webpage, 'thumbnail', default=None) or @@ -272,10 +401,8 @@ class NiconicoIE(InfoExtractor): return { 'id': video_id, - 'url': video_real_url, 'title': title, - 'ext': extension, - 'format_id': 'economy' if video_real_url.endswith('low') else 'normal', + 'formats': formats, 'thumbnail': thumbnail, 'description': description, 'uploader': uploader, From 15d1e8a23dbaa28635cae30ff6c5cfb095b4c7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 7 Aug 2017 22:43:42 +0700 Subject: [PATCH 047/238] [dplayit] Fix extraction (closes #13851) --- youtube_dl/extractor/dplay.py | 66 ++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 1a41760f8..76e784105 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -7,16 +7,18 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_HTTPError, + compat_str, + compat_urlparse, ) from ..utils import ( - USER_AGENTS, ExtractorError, int_or_none, - unified_strdate, remove_end, + try_get, + unified_strdate, update_url_query, + USER_AGENTS, ) @@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - info_url = self._search_regex( - r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'video id') - title = remove_end(self._og_search_title(webpage), ' | Dplay') - try: - info = self._download_json( - info_url, display_id, headers={ - 'Authorization': 'Bearer %s' % self._get_cookies(url).get( - 'dplayit_token').value, - 'Referer': url, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - if error.get('code') == 'access.denied.geoblocked': - self.raise_geo_restricted( - msg=error.get('detail'), countries=self._GEO_COUNTRIES) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - raise + video_id = None + + info = self._search_regex( + r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")', + webpage, 'playback JSON', default=None) + if info: + for _ in range(2): + info = self._parse_json(info, display_id, fatal=False) + if not info: + break + else: + video_id = try_get(info, lambda x: x['data']['id']) + + if not info: + info_url = self._search_regex( + r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', + webpage, 'info url') + + video_id = info_url.rpartition('/')[-1] + + try: + info = self._download_json( + info_url, display_id, headers={ + 'Authorization': 'Bearer %s' % self._get_cookies(url).get( + 'dplayit_token').value, + 'Referer': url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): + info = self._parse_json(e.cause.read().decode('utf-8'), display_id) + error = info['errors'][0] + if error.get('code') == 'access.denied.geoblocked': + self.raise_geo_restricted( + msg=error.get('detail'), countries=self._GEO_COUNTRIES) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + raise hls_url = info['data']['attributes']['streaming']['hls']['url'] @@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor): season_number = episode_number = upload_date = None return { - 'id': info_url.rpartition('/')[-1], + 'id': compat_str(video_id or display_id), 'display_id': display_id, 'title': title, 'description': self._og_search_description(webpage), From 4bf22f7a1014c55e3358b5a419945071b152eafc Mon Sep 17 00:00:00 2001 From: Alex Seiler <seileralex@gmail.com> Date: Tue, 8 Aug 2017 00:41:38 +0200 Subject: [PATCH 048/238] [20min] Fix embeds extraction --- youtube_dl/extractor/twentymin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index 4fd1aa4bf..a42977f39 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -50,7 +50,7 @@ class TwentyMinutenIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', + r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', webpage)] def _real_extract(self, url): From 5b232f46dcbdc805507c02edd4fd598f31d544d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 9 Aug 2017 22:28:19 +0700 Subject: [PATCH 049/238] [utils] Skip missing params in cli_bool_option (closes #13865) --- test/test_utils.py | 4 ++++ youtube_dl/utils.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 7803e5bc7..2aab16b97 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1182,6 +1182,10 @@ part 3</font></u> cli_bool_option( {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), ['--check-certificate=true']) + self.assertEqual( + cli_bool_option( + {}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), + []) def test_ohdave_rsa_encrypt(self): N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fdf5e29e7..c9cbd5842 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2733,6 +2733,8 @@ def cli_option(params, command_option, param): def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): param = params.get(param) + if param is None: + return [] assert isinstance(param, bool) if separator: return [command_option + separator + (true_value if param else false_value)] From 5b3ddadcc3012c4ef390a7cf70dbcb8b83f07428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 9 Aug 2017 22:55:13 +0700 Subject: [PATCH 050/238] [mixcloud] Fix play info decryption (closes #13867) --- youtube_dl/extractor/mixcloud.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 0efbe660a..40cd2e389 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -54,15 +54,22 @@ class MixcloudIE(InfoExtractor): }] # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - @staticmethod - def _decrypt_play_info(play_info): - KEY = 'pleasedontdownloadourmusictheartistswontgetpaid' - + def _decrypt_play_info(self, play_info, video_id): + KEYS = ( + 'pleasedontdownloadourmusictheartistswontgetpaid', + '(function() { return new Date().toLocaleDateString(); })()' + ) play_info = base64.b64decode(play_info.encode('ascii')) - - return ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)])) - for idx, ch in enumerate(play_info)]) + for num, key in enumerate(KEYS, start=1): + try: + return self._parse_json( + ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) + for idx, ch in enumerate(play_info)]), + video_id) + except ExtractorError: + if num == len(KEYS): + raise def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -78,8 +85,8 @@ class MixcloudIE(InfoExtractor): encrypted_play_info = self._search_regex( r'm-play-info="([^"]+)"', webpage, 'play info') - play_info = self._parse_json( - self._decrypt_play_info(encrypted_play_info), track_id) + + play_info = self._decrypt_play_info(encrypted_play_info, track_id) if message and 'stream_url' not in play_info: raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) From dee04d24a422c0ea5586d2f1d1f97f1e3e4ecf70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 9 Aug 2017 23:12:02 +0700 Subject: [PATCH 051/238] [nick] Add support for nick.com.pl (closes #13860) --- youtube_dl/extractor/nick.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index b688637bc..510b1c41f 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -75,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor): class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', 'only_matching': True, @@ -88,6 +88,9 @@ class NickDeIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht', 'only_matching': True, + }, { + 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom', + 'only_matching': True, }] def _extract_mrss_url(self, webpage, host): From baba5f4d1daa29c42b2ad56c06e3880f10b7b03d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 9 Aug 2017 23:46:49 +0700 Subject: [PATCH 052/238] [xxxymovies] Fix title extraction (closes #13868) --- youtube_dl/extractor/xxxymovies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py index 5c8f17eb2..e34ebe3a6 100644 --- a/youtube_dl/extractor/xxxymovies.py +++ b/youtube_dl/extractor/xxxymovies.py @@ -39,8 +39,8 @@ class XXXYMoviesIE(InfoExtractor): r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') title = self._html_search_regex( - [r'<div class="block_header">\s*<h1>([^<]+)</h1>', - r'<title>(.*?)\s*-\s*XXXYMovies\.com'], + [r']+\bclass="block_header"[^>]*>\s*

([^<]+)<', + r'(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)'], webpage, 'title') thumbnail = self._search_regex( From 0e7dfa7d16e9e013bfaa085a59b9bbe4b4d1dfb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 9 Aug 2017 23:49:53 +0700 Subject: [PATCH 053/238] [ChangeLog] Actualize --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index 084e98c0e..7c8eb92aa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,14 @@ version +Core +* [utils] Skip missing params in cli_bool_option (#13865) + Extractors +* [xxxymovies] Fix title extraction (#13868) ++ [nick] Add support for nick.com.pl (#13860) +* [mixcloud] Fix play info decryption (#13867) +* [20min] Fix embeds extraction (#13852) +* [dplayit] Fix extraction (#13851) + [niconico] Support videos with multiple formats (#13522) + [niconico] Support HTML5-only videos (#13806) From 6ed99754bb6074454b5d4875cc7b8b442e763ec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 9 Aug 2017 23:52:22 +0700 Subject: [PATCH 054/238] release 2017.08.09 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 5b72032bc..7ee704e48 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.06*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.06** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.09** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.06 +[debug] youtube-dl version 2017.08.09 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7c8eb92aa..b28ade446 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.08.09 Core * [utils] Skip missing params in cli_bool_option (#13865) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 11d3bf29f..022172375 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.06' +__version__ = '2017.08.09' From 41918eaa5ce1225f7e0a94882e7c77919342210d Mon Sep 17 00:00:00 2001 From: tetra-eder <30865771+tetra-eder@users.noreply.github.com> Date: Fri, 11 Aug 2017 17:00:39 +0200 Subject: [PATCH 055/238] [generic] Add support for vzaar embeds --- youtube_dl/extractor/generic.py | 17 +++++++++++++++++ youtube_dl/extractor/vzaar.py | 8 ++++++++ 2 files changed, 25 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 34e814988..51acead66 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -98,6 +98,7 @@ from .wistia import WistiaIE from .mediaset import MediasetIE from .joj import JojIE from .megaphone import MegaphoneIE +from .vzaar import VzaarIE class GenericIE(InfoExtractor): @@ -1840,6 +1841,16 @@ class GenericIE(InfoExtractor): 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', }, }, + { + # vzaar embed + 'url': 'http://www.xruniversity.com/bdsm-lets-begin-melissa-moore/', + 'md5': 'cddc9fb8a8644a0a7742149eee95080b', + 'info_dict': { + 'id': '11002506', + 'ext': 'mp4', + 'title': 'XR-U SHOW: Ready Player Fuck - EP. 61', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2781,6 +2792,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + # Look for Rutube embeds rutube_urls = RutubeIE._extract_urls(webpage) if rutube_urls: diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py index b270f08d1..02fcd52c7 100644 --- a/youtube_dl/extractor/vzaar.py +++ b/youtube_dl/extractor/vzaar.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -28,6 +30,12 @@ class VzaarIE(InfoExtractor): }, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( From 1663bd6e1c11bf6cbf290fcbbf12358207570faf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 Aug 2017 22:02:00 +0700 Subject: [PATCH 056/238] [generic] Replace vzaar embed test --- youtube_dl/extractor/generic.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 51acead66..8362d9a36 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1843,12 +1843,12 @@ class GenericIE(InfoExtractor): }, { # vzaar embed - 'url': 'http://www.xruniversity.com/bdsm-lets-begin-melissa-moore/', - 'md5': 'cddc9fb8a8644a0a7742149eee95080b', + 'url': 'http://help.vzaar.com/article/165-embedding-video', + 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', 'info_dict': { - 'id': '11002506', + 'id': '8707641', 'ext': 'mp4', - 'title': 'XR-U SHOW: Ready Player Fuck - EP. 61', + 'title': 'Building A Business Online: Principal Chairs Q & A', }, }, # { @@ -2792,12 +2792,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - # Look for Rutube embeds rutube_urls = RutubeIE._extract_urls(webpage) if rutube_urls: @@ -2828,6 +2822,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): From 92a5c415328953851d0a6b7893de5387a1b7b469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 16:30:50 +0700 Subject: [PATCH 057/238] [mixcloud] Fix play info decryption (closes #13885) --- youtube_dl/extractor/mixcloud.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 40cd2e389..52f7428e0 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -57,7 +57,8 @@ class MixcloudIE(InfoExtractor): def _decrypt_play_info(self, play_info, video_id): KEYS = ( 'pleasedontdownloadourmusictheartistswontgetpaid', - '(function() { return new Date().toLocaleDateString(); })()' + 'window.addEventListener = window.addEventListener || function() {};', + '(function() { return new Date().toLocaleDateString(); })()', ) play_info = base64.b64decode(play_info.encode('ascii')) for num, key in enumerate(KEYS, start=1): From 82889d4ae517640df217f99e7744002e0deba47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 16:48:11 +0700 Subject: [PATCH 058/238] [extractor/common] Respect source's type attribute for HTML5 media (closes #13892) --- youtube_dl/extractor/common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 459e7ffd6..4d61275fd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2114,9 +2114,9 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type): + def _media_formats(src, type_info, cur_media_type): full_url = absolute_url(src) - ext = determine_ext(full_url) + ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( @@ -2165,9 +2165,9 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - is_plain_url, formats = _media_formats(src, media_type) + f = parse_content_type(source_attributes.get('type')) + is_plain_url, formats = _media_formats(src, f, media_type) if is_plain_url: - f = parse_content_type(source_attributes.get('type')) f.update(formats[0]) media_info['formats'].append(f) else: From ac8491fcca6f9c0f6c7904e1cf13953f912eeb39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 17:11:35 +0700 Subject: [PATCH 059/238] [extractor/common] Make _family_friendly_search optional --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4d61275fd..e565901af 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -940,7 +940,8 @@ class InfoExtractor(object): def _family_friendly_search(self, html): # See http://schema.org/VideoObject - family_friendly = self._html_search_meta('isFamilyFriendly', html) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', html, default=None) if not family_friendly: return None From e74e3b63e3cdb31a61af1fc21c703e912c029b96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 17:14:11 +0700 Subject: [PATCH 060/238] [YoutubeDL] Make sure format id is not empty --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 367ae3533..df7378f83 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1500,7 +1500,7 @@ class YoutubeDL(object): sanitize_string_field(format, 'format_id') sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) - if format.get('format_id') is None: + if not format.get('format_id'): format['format_id'] = compat_str(i) else: # Sanitize format_id from characters used in format selector expression From 70851a95c307880f016fcb6f37427a8eeae73cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 17:18:23 +0700 Subject: [PATCH 061/238] [aparat] Extract all formats (closes #13887) --- youtube_dl/extractor/aparat.py | 49 +++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 025e29aa4..e394cb661 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, - HEADRequest, + int_or_none, + mimetype2ext, ) class AparatIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'http://www.aparat.com/v/wP8On', @@ -29,30 +29,41 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - file_list = self._parse_json(self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) - for i, item in enumerate(file_list[0]): - video_url = item['file'] - req = HEADRequest(video_url) - res = self._request_webpage( - req, video_id, note='Testing video URL %d' % i, errnote=False) - if res: - break - else: - raise ExtractorError('No working video URLs found') + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + + file_list = self._parse_json( + self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, + 'file list'), + video_id) + + formats = [] + for item in file_list[0]: + file_url = item.get('file') + if not file_url: + continue + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': label or ext, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', default=None)), + }) + self._sort_formats(formats) + thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'title': title, - 'url': video_url, - 'ext': 'mp4', 'thumbnail': thumbnail, 'age_limit': self._family_friendly_search(webpage), + 'formats': formats, } From 868f79db41a4d81a87ef12c8bd5ef73205c9c029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 19:24:26 +0700 Subject: [PATCH 062/238] [extractor/common] Fix _media_formats --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e565901af..7fe888462 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2115,7 +2115,7 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, type_info, cur_media_type): + def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -2167,7 +2167,7 @@ class InfoExtractor(object): if not src: continue f = parse_content_type(source_attributes.get('type')) - is_plain_url, formats = _media_formats(src, f, media_type) + is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: f.update(formats[0]) media_info['formats'].append(f) From 0c43a481b91c657643eb42f72d293f245a410c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Aug 2017 23:24:46 +0700 Subject: [PATCH 063/238] [reddit] Add extractors (closes #13847) --- youtube_dl/extractor/extractors.py | 4 + youtube_dl/extractor/reddit.py | 114 +++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 youtube_dl/extractor/reddit.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 897557f93..f1a9f6edf 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -840,6 +840,10 @@ from .rai import ( from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE +from .reddit import ( + RedditIE, + RedditRIE, +) from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py new file mode 100644 index 000000000..01c85ee01 --- /dev/null +++ b/youtube_dl/extractor/reddit.py @@ -0,0 +1,114 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + float_or_none, +) + + +class RedditIE(InfoExtractor): + _VALID_URL = r'https?://v\.redd\.it/(?P[^/?#&]+)' + _TEST = { + # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '655d06ace653ea3b87bccfb1b27ec99d', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'zv89llsvexdz', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = self._extract_m3u8_formats( + 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + + formats.extend(self._extract_mpd_formats( + 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, + mpd_id='dash', fatal=False)) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + +class RedditRIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P[^/]+)' + _TESTS = [{ + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'title': 'That small heart attack.', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1501941939, + 'upload_date': '20170805', + 'uploader': 'Antw87', + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, { + 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', + 'only_matching': True, + }, { + # imgur + 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/', + 'only_matching': True, + }, { + # streamable + 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/', + 'only_matching': True, + }, { + # youtube + 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + url + '.json', video_id)[0]['data']['children'][0]['data'] + + video_url = data['url'] + + # Avoid recursing into the same reddit URL + if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: + raise ExtractorError('No media found', expected=True) + + over_18 = data.get('over_18') + if over_18 is True: + age_limit = 18 + elif over_18 is False: + age_limit = 0 + else: + age_limit = None + + return { + '_type': 'url_transparent', + 'url': video_url, + 'title': data.get('title'), + 'thumbnail': data.get('thumbnail'), + 'timestamp': float_or_none(data.get('created_utc')), + 'uploader': data.get('author'), + 'like_count': int_or_none(data.get('ups')), + 'dislike_count': int_or_none(data.get('downs')), + 'comment_count': int_or_none(data.get('num_comments')), + 'age_limit': age_limit, + } From 4ef9152428c4a000cb5fc76732fc579f1f4c1d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Aug 2017 00:58:39 +0700 Subject: [PATCH 064/238] [limelight] Improve embeds detection (closes #13895) --- youtube_dl/extractor/limelight.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 0a5a3956c..ad65b2759 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -26,14 +26,16 @@ class LimelightBaseIE(InfoExtractor): 'Channel': 'channel', 'ChannelList': 'channel_list', } + + def smuggle(url): + return smuggle_url(url, {'source_url': source_url}) + entries = [] for kind, video_id in re.findall( r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P[a-z0-9]{32})', webpage): entries.append(cls.url_result( - smuggle_url( - 'limelight:%s:%s' % (lm[kind], video_id), - {'source_url': source_url}), + smuggle('limelight:%s:%s' % (lm[kind], video_id)), 'Limelight%s' % kind, video_id)) for mobj in re.finditer( # As per [1] class attribute should be exactly equal to @@ -49,10 +51,15 @@ class LimelightBaseIE(InfoExtractor): ''', webpage): kind, video_id = mobj.group('kind'), mobj.group('id') entries.append(cls.url_result( - smuggle_url( - 'limelight:%s:%s' % (kind, video_id), - {'source_url': source_url}), + smuggle('limelight:%s:%s' % (kind, video_id)), 'Limelight%s' % kind.capitalize(), video_id)) + # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page) + for video_id in re.findall( + r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle('limelight:media:%s' % video_id), + LimelightMediaIE.ie_key(), video_id)) return entries def _call_playlist_service(self, item_id, method, fatal=True, referer=None): From eb02940cc7dc2233f2d7873c12165245a3c3c14e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Aug 2017 01:11:27 +0700 Subject: [PATCH 065/238] [generic] Add test for #13895 --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8362d9a36..eff5fbfe8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1785,6 +1785,21 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 5, }, + { + # Limelight embed (LimelightPlayerUtil.embed) + 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', + 'info_dict': { + 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', + 'ext': 'mp4', + 'title': '07448641', + 'timestamp': 1499890639, + 'upload_date': '20170712', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['LimelightMedia'], + }, { 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', 'info_dict': { From b3c6515365ed415bbf813c0c2e6c12585824b77a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Aug 2017 07:23:29 +0700 Subject: [PATCH 066/238] [fourtube] Add support for other sites (closes #6022, closes #7859, closes #13901) --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/fourtube.py | 176 +++++++++++++++++++++++------ 2 files changed, 147 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f1a9f6edf..fb79a1736 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -350,7 +350,12 @@ from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE -from .fourtube import FourTubeIE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) from .fox import FOXIE from .fox9 import FOX9IE from .foxgay import FoxgayIE diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index e3fd08bcf..ad273a0e7 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,39 +3,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( parse_duration, parse_iso8601, - sanitized_Request, str_to_int, ) -class FourTubeIE(InfoExtractor): - IE_NAME = '4tube' - _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P\d+)' - - _TEST = { - 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '209733', - 'ext': 'mp4', - 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', - 'uploader': 'WCP Club', - 'uploader_id': 'wcp-club', - 'upload_date': '20131031', - 'timestamp': 1383263892, - 'duration': 583, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 18, - } - } - +class FourTubeBaseIE(InfoExtractor): def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') + + if kind == 'm' or not display_id: + url = self._URL_TEMPLATE % video_id + webpage = self._download_webpage(url, video_id) title = self._html_search_meta('name', webpage) @@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'', + r'', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'', + r'', webpage, 'uploader', fatal=False) categories_html = self._search_regex( @@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor): view_count = str_to_int(self._search_regex( r']+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', - webpage, 'view count', fatal=False)) + webpage, 'view count', default=None)) like_count = str_to_int(self._search_regex( r']+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) duration = parse_duration(self._html_search_meta('duration', webpage)) media_id = self._search_regex( @@ -87,12 +70,12 @@ class FourTubeIE(InfoExtractor): token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( media_id, '+'.join(sources)) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - b'Origin': b'https://www.4tube.com', - } - token_req = sanitized_Request(token_url, b'{}', headers) - tokens = self._download_json(token_req, video_id) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) formats = [{ 'url': tokens[format]['token'], 'format_id': format + 'p', @@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor): 'duration': duration, 'age_limit': 18, } + + +class FourTubeIE(FourTubeBaseIE): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?4tube\.com/(?:videos|embed)/(?P\d+)(?:/(?P[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video' + _TESTS = [{ + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'timestamp': 1383263892, + 'duration': 583, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + }, { + 'url': 'http://www.4tube.com/embed/209733', + 'only_matching': True, + }, { + 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'only_matching': True, + }] + + +class FuxIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?fux\.com/(?:video|embed)/(?P\d+)(?:/(?P[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.fux.com/video/%s/video' + _TESTS = [{ + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'info_dict': { + 'id': '195359', + 'ext': 'mp4', + 'title': 'Awesome fucking in the kitchen ends with cum swallow', + 'uploader': 'alenci2342', + 'uploader_id': 'alenci2342', + 'upload_date': '20131230', + 'timestamp': 1388361660, + 'duration': 289, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.fux.com/embed/195359', + 'only_matching': True, + }, { + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'only_matching': True, + }] + + +class PornTubeIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?porntube\.com/(?:videos/(?P[^/]+)_|embed/)(?P\d+)' + _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', + 'info_dict': { + 'id': '7089759', + 'ext': 'mp4', + 'title': 'Teen couple doing anal', + 'uploader': 'Alexy', + 'uploader_id': 'Alexy', + 'upload_date': '20150606', + 'timestamp': 1433595647, + 'duration': 5052, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/embed/7089759', + 'only_matching': True, + }, { + 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759', + 'only_matching': True, + }] + + +class PornerBrosIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?pornerbros\.com/(?:videos/(?P[^/]+)_|embed/)(?P\d+)' + _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '181369', + 'ext': 'mp4', + 'title': 'Skinny brunette takes big cock down her anal hole', + 'uploader': 'PornerBros HD', + 'uploader_id': 'pornerbros-hd', + 'upload_date': '20130130', + 'timestamp': 1359527401, + 'duration': 1224, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.pornerbros.com/embed/181369', + 'only_matching': True, + }, { + 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'only_matching': True, + }] From 475bcb225f6046e38b47594c504da6ec15bac113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Aug 2017 07:53:02 +0700 Subject: [PATCH 067/238] [pornhub:playlistbase] Skip videos from drop-down menu for all playlists (closes #12819, closes #13902) --- youtube_dl/extractor/pornhub.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index e032817f2..f6777cd26 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -227,20 +227,6 @@ class PornHubIE(InfoExtractor): class PornHubPlaylistBaseIE(InfoExtractor): def _extract_entries(self, webpage): - return [ - self.url_result( - 'http://www.pornhub.com/%s' % video_url, - PornHubIE.ie_key(), video_title=title) - for video_url, title in orderedSet(re.findall( - r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', - webpage)) - ] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see # https://github.com/rg3/youtube-dl/issues/11594). @@ -248,7 +234,21 @@ class PornHubPlaylistBaseIE(InfoExtractor): r'(?s)(]+class=["\']container.+)', webpage, 'container', default=webpage) - entries = self._extract_entries(container) + return [ + self.url_result( + 'http://www.pornhub.com/%s' % video_url, + PornHubIE.ie_key(), video_title=title) + for video_url, title in orderedSet(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"', + container)) + ] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = self._extract_entries(webpage) playlist = self._parse_json( self._search_regex( From 4f049e4aa866aef89d0f4b735fd89eb2ba84e809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Aug 2017 08:00:15 +0700 Subject: [PATCH 068/238] [ChangeLog] Actualize --- ChangeLog | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ChangeLog b/ChangeLog index b28ade446..daa3601ba 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +version + +Core +* [YoutubeDL] Make sure format id is not empty +* [extractor/common] Make _family_friendly_search optional +* [extractor/common] Respect source's type attribute for HTML5 media (#13892) + +Extractors +* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902) ++ [fourtube] Add support pornerbros.com (#6022) ++ [fourtube] Add support porntube.com (#7859, #13901) ++ [fourtube] Add support fux.com +* [limelight] Improve embeds detection (#13895) ++ [reddit] Add support for v.redd.it and reddit.com (#13847) +* [aparat] Extract all formats (#13887) +* [mixcloud] Fix play info decryption (#13885) ++ [generic] Add support for vzaar embeds (#13876) + + version 2017.08.09 Core From 16393d65355cdb1118e528d6dcb6d82f5f1c2b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Aug 2017 08:58:30 +0700 Subject: [PATCH 069/238] release 2017.08.13 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7ee704e48..3bd61e0a6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.09*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.09** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.13*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.13** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.09 +[debug] youtube-dl version 2017.08.13 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index daa3601ba..6bafb1e8f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.08.13 Core * [YoutubeDL] Make sure format id is not empty diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a3bd07726..cc442742f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -294,6 +294,7 @@ - **Funimation** - **FunnyOrDie** - **Fusion** + - **Fux** - **FXNetworks** - **GameInformer** - **GameOne** @@ -621,6 +622,7 @@ - **PolskieRadio** - **PolskieRadioCategory** - **PornCom** + - **PornerBros** - **PornFlip** - **PornHd** - **PornHub**: PornHub and Thumbzilla @@ -629,6 +631,7 @@ - **Pornotube** - **PornoVoisines** - **PornoXO** + - **PornTube** - **PressTV** - **PrimeShareTV** - **PromptFile** @@ -654,6 +657,8 @@ - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** + - **Reddit** + - **RedditR** - **RedTube** - **RegioTV** - **RENTV** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 022172375..da855a602 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.09' +__version__ = '2017.08.13' From da20951a57bddd4a0102cd776ff93a2adc6db77d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 14 Aug 2017 22:39:05 +0700 Subject: [PATCH 070/238] [mixcloud] Extract decrypt key --- youtube_dl/extractor/mixcloud.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 52f7428e0..fcf7beeb2 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, + compat_str, compat_urllib_parse_unquote, compat_urlparse, ) @@ -53,15 +54,18 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] + _keys = [ + 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };', + 'pleasedontdownloadourmusictheartistswontgetpaid', + 'window.addEventListener = window.addEventListener || function() {};', + '(function() { return new Date().toLocaleDateString(); })()' + ] + _current_key = None + # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js def _decrypt_play_info(self, play_info, video_id): - KEYS = ( - 'pleasedontdownloadourmusictheartistswontgetpaid', - 'window.addEventListener = window.addEventListener || function() {};', - '(function() { return new Date().toLocaleDateString(); })()', - ) play_info = base64.b64decode(play_info.encode('ascii')) - for num, key in enumerate(KEYS, start=1): + for num, key in enumerate(self._keys, start=1): try: return self._parse_json( ''.join([ @@ -69,7 +73,7 @@ class MixcloudIE(InfoExtractor): for idx, ch in enumerate(play_info)]), video_id) except ExtractorError: - if num == len(KEYS): + if num == len(self._keys): raise def _real_extract(self, url): @@ -80,6 +84,20 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) + if not self._current_key: + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', + webpage, 'js url', default=None) + if js_url: + js = self._download_webpage(js_url, track_id, fatal=False) + if js: + key = self._search_regex( + r'player\s*:\s*{.*?\bvalue\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + js, 'key', default=None, group='key') + if key and isinstance(key, compat_str): + self._keys.insert(0, key) + self._current_key = key + message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) From 19ada898dc80a04ae1a2590c8886c9ec13958b03 Mon Sep 17 00:00:00 2001 From: forDream Date: Wed, 2 Aug 2017 11:12:17 +0800 Subject: [PATCH 071/238] fix QQ Music Url changed --- youtube_dl/extractor/qqmusic.py | 43 +++++++++++++++++---------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 17c27da46..6bff6baa8 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals import random -import time import re +import time from .common import InfoExtractor from ..utils import ( @@ -18,9 +18,9 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P[0-9A-Za-z]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', + 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', 'info_dict': { 'id': '004295Et37taLD', @@ -33,7 +33,7 @@ class QQMusicIE(InfoExtractor): } }, { 'note': 'There is no mp3-320 version of this song.', - 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html', 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', 'info_dict': { 'id': '004MsGEo3DdNxV', @@ -46,7 +46,7 @@ class QQMusicIE(InfoExtractor): } }, { 'note': 'lyrics not in .lrc format', - 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6', + 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html', 'info_dict': { 'id': '001JyApY11tIp6', 'ext': 'mp3', @@ -163,7 +163,8 @@ class QQPlaylistBaseIE(InfoExtractor): for item in re.findall(r'class="data"[^<>]*>([^<>]+)[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P[0-9A-Za-z]+)\.html' _TEST = { - 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', + 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html', 'info_dict': { 'id': '001BLpXF2DyJe2', 'title': '林俊杰', @@ -217,10 +218,10 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P[0-9A-Za-z]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', + 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', @@ -228,7 +229,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): }, 'playlist_count': 4, }, { - 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', + 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html', 'info_dict': { 'id': '002Y5a3b3AlCu3', 'title': '그리고...', @@ -246,7 +247,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): entries = [ self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + ".html", 'QQMusic', song['songmid'] ) for song in album['list'] ] album_name = album.get('name') @@ -260,17 +261,17 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P(top|global)_[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=toplist&p=global_123', + 'url': 'https://y.qq.com/n/yqq/toplist/123.html', 'info_dict': { 'id': 'global_123', 'title': '美国iTunes榜', }, 'playlist_count': 10, }, { - 'url': 'http://y.qq.com/#type=toplist&p=top_3', + 'url': 'https://y.qq.com/n/yqq/toplist/3.html', 'info_dict': { 'id': 'top_3', 'title': '巅峰榜·欧美', @@ -281,7 +282,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): }, 'playlist_count': 100, }, { - 'url': 'http://y.qq.com/#type=toplist&p=global_106', + 'url': 'https://y.qq.com/n/yqq/toplist/106.html', 'info_dict': { 'id': 'global_106', 'title': '韩国Mnet榜', @@ -301,7 +302,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): entries = [ self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + ".html", 'QQMusic', song['data']['songmid'] ) for song in toplist_json['songlist'] ] @@ -314,10 +315,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P[0-9]+)\.html' _TESTS = [{ - 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html', 'info_dict': { 'id': '3462654915', 'title': '韩国5月新歌精选下旬', @@ -326,7 +327,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): 'playlist_count': 40, 'skip': 'playlist gone', }, { - 'url': 'http://y.qq.com/#type=taoge&id=1374105607', + 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html', 'info_dict': { 'id': '1374105607', 'title': '易入人心的华语民谣', @@ -352,7 +353,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): cdlist = list_json['cdlist'][0] entries = [ self.url_result( - 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + ".html", 'QQMusic', song['songmid'] ) for song in cdlist['songlist'] ] From 5d1bd3b907d22eab7c47b8b408c07a26dbc358ea Mon Sep 17 00:00:00 2001 From: forDream Date: Wed, 2 Aug 2017 12:20:53 +0800 Subject: [PATCH 072/238] [qqmusic]update valid url --- youtube_dl/extractor/qqmusic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 6bff6baa8..7513acba9 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -261,7 +261,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P(top|global)_[0-9]+)\.html' + _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P[0-9]+)\.html' _TESTS = [{ 'url': 'https://y.qq.com/n/yqq/toplist/123.html', @@ -293,7 +293,9 @@ class QQMusicToplistIE(QQPlaylistBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - list_type, num_id = list_id.split("_") + # list_type, num_id = list_id.split("_") + list_type = "toplist" + num_id = list_id toplist_json = self._download_json( 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' From 5c037c0d1f155e951050533690d6e990654cfcc9 Mon Sep 17 00:00:00 2001 From: forDream Date: Wed, 2 Aug 2017 15:08:38 +0800 Subject: [PATCH 073/238] [qqmusic]support QQMusicSingerIE --- youtube_dl/extractor/qqmusic.py | 52 +++++++++++++++++---------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 7513acba9..42be6bc7b 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -156,16 +156,27 @@ class QQPlaylistBaseIE(InfoExtractor): def qq_static_url(category, mid): return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid) - @classmethod - def get_entries_from_page(cls, page): + def get_singer_all_songs(self, singmid, num): + return self._download_webpage( + r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg?format=json&inCharset=utf8&outCharset=utf-8&platform=yqq&needNewCode=0&singermid=%s&order=listen&begin=0&num=%s&songstatus=1' % + (singmid, num), singmid) + + def get_entries_from_page(self, singmid): entries = [] - for item in re.findall(r'class="data"[^<>]*>([^<>]+) Date: Mon, 14 Aug 2017 08:28:41 +0800 Subject: [PATCH 074/238] [qqmusic] review --- youtube_dl/extractor/qqmusic.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 42be6bc7b..38f4b2cab 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -166,15 +166,15 @@ class QQPlaylistBaseIE(InfoExtractor): default_num = 1 json_text = self.get_singer_all_songs(singmid, default_num) - json_obj = self._parse_json(json_text, singmid) + json_obj_all_songs = self._parse_json(json_text, singmid) - if json_obj['code'] == 0: - total = json_obj['data']['total'] + if json_obj_all_songs['code'] == 0: + total = json_obj_all_songs['data']['total'] json_text = self.get_singer_all_songs(singmid, total) - json_obj = self._parse_json(json_text, singmid) + json_obj_all_songs = self._parse_json(json_text, singmid) - for item in json_obj['data']['list']: - if not (item['musicData'].get('songmid') is None): + for item in json_obj_all_songs['data']['list']: + if item['musicData'].get('songmid') is not None: songmid = item['musicData']['songmid'] entries.append(self.url_result(r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) @@ -248,7 +248,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): entries = [ self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + ".html", 'QQMusic', song['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] ) for song in album['list'] ] album_name = album.get('name') @@ -294,8 +294,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - # list_type, num_id = list_id.split("_") - list_type = "toplist" + list_type = 'toplist' num_id = list_id toplist_json = self._download_json( @@ -305,7 +304,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): entries = [ self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + ".html", 'QQMusic', + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', song['data']['songmid'] ) for song in toplist_json['songlist'] ] @@ -357,7 +356,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): cdlist = list_json['cdlist'][0] entries = [ self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + ".html", 'QQMusic', song['songmid'] + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] ) for song in cdlist['songlist'] ] From 485047854376465f95309daad4966971f56728ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 15 Aug 2017 23:58:00 +0700 Subject: [PATCH 075/238] [extractor/common] Add support for float durations in _parse_mpd_formats (closes #13919) --- test/test_InfoExtractor.py | 86 ++++++++++++++++++++++++++++ test/testdata/mpd/float_duration.mpd | 18 ++++++ youtube_dl/extractor/common.py | 2 +- 3 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 test/testdata/mpd/float_duration.mpd diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6f52e11f7..f18a823fc 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -10,6 +10,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, expect_dict, expect_value +from youtube_dl.compat import compat_etree_fromstring from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError @@ -488,6 +489,91 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_mpd_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/13919 + 'float_duration', + 'http://unknown/manifest.mpd', + [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '318597', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.42001f', + 'tbr': 318.597, + 'width': 340, + 'height': 192, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '638590', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.42001f', + 'tbr': 638.59, + 'width': 512, + 'height': 288, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '1022565', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.4d001f', + 'tbr': 1022.565, + 'width': 688, + 'height': 384, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '2046506', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.4d001f', + 'tbr': 2046.506, + 'width': 1024, + 'height': 576, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '3998017', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.640029', + 'tbr': 3998.017, + 'width': 1280, + 'height': 720, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '5997485', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.640032', + 'tbr': 5997.485, + 'width': 1920, + 'height': 1080, + }] + ), + ] + + for mpd_file, mpd_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_mpd_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + mpd_url=mpd_url) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) + if __name__ == '__main__': unittest.main() diff --git a/test/testdata/mpd/float_duration.mpd b/test/testdata/mpd/float_duration.mpd new file mode 100644 index 000000000..8dc1d2d5e --- /dev/null +++ b/test/testdata/mpd/float_duration.mpd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7fe888462..e747258aa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1786,7 +1786,7 @@ class InfoExtractor(object): ms_info['timescale'] = int(timescale) segment_duration = source.get('duration') if segment_duration: - ms_info['segment_duration'] = int(segment_duration) + ms_info['segment_duration'] = float(segment_duration) def extract_Initialization(source): initialization = source.find(_add_ns('Initialization')) From a1aa6596626a98d068780f092367b87398840c47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Aug 2017 23:03:42 +0700 Subject: [PATCH 076/238] [periscope] Renew HLS extraction (closes #13917) --- youtube_dl/extractor/periscope.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index bfa12edc9..e5e08538c 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -80,18 +80,24 @@ class PeriscopeIE(PeriscopeBaseIE): stream = self._call_api( 'getAccessPublic', {'broadcast_id': token}, token) + video_urls = set() formats = [] - for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): + for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): video_url = stream.get(format_id + '_url') - if not video_url: + if not video_url or video_url in video_urls: continue - f = { + video_urls.add(video_url) + if format_id != 'rtmp': + formats.extend(self._extract_m3u8_formats( + video_url, token, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=False)) + continue + formats.append({ 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', - } - if format_id != 'rtmp': - f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8' - formats.append(f) + }) self._sort_formats(formats) return { From 25a6e769a1af3a79f439369fb683a1d487777cb9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 17 Aug 2017 16:39:57 +0800 Subject: [PATCH 077/238] [qqmusic] Fix tests and cleanup --- youtube_dl/extractor/qqmusic.py | 89 ++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 38f4b2cab..62f986050 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -7,11 +7,10 @@ import time from .common import InfoExtractor from ..utils import ( - sanitized_Request, - strip_jsonp, - unescapeHTML, clean_html, ExtractorError, + strip_jsonp, + unescapeHTML, ) @@ -21,14 +20,14 @@ class QQMusicIE(InfoExtractor): _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P[0-9A-Za-z]+)\.html' _TESTS = [{ 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html', - 'md5': '9ce1c1c8445f561506d2e3cfb0255705', + 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8', 'info_dict': { 'id': '004295Et37taLD', 'ext': 'mp3', 'title': '可惜没如果', 'release_date': '20141227', 'creator': '林俊杰', - 'description': 'md5:d327722d0361576fde558f1ac68a7065', + 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac', 'thumbnail': r're:^https?://.*\.jpg$', } }, { @@ -53,7 +52,7 @@ class QQMusicIE(InfoExtractor): 'title': 'Shadows Over Transylvania', 'release_date': '19970225', 'creator': 'Dark Funeral', - 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11', + 'description': 'md5:c9b20210587cbcd6836a1c597bab4525', 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { @@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor): [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) if albummid: - thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \ % (albummid[-2:-1], albummid[-1], albummid) guid = self.m_r_get_ruin() @@ -158,8 +157,19 @@ class QQPlaylistBaseIE(InfoExtractor): def get_singer_all_songs(self, singmid, num): return self._download_webpage( - r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg?format=json&inCharset=utf8&outCharset=utf-8&platform=yqq&needNewCode=0&singermid=%s&order=listen&begin=0&num=%s&songstatus=1' % - (singmid, num), singmid) + r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid, + query={ + 'format': 'json', + 'inCharset': 'utf8', + 'outCharset': 'utf-8', + 'platform': 'yqq', + 'needNewCode': 0, + 'singermid': singmid, + 'order': 'listen', + 'begin': 0, + 'num': num, + 'songstatus': 1, + }) def get_entries_from_page(self, singmid): entries = [] @@ -176,7 +186,8 @@ class QQPlaylistBaseIE(InfoExtractor): for item in json_obj_all_songs['data']['list']: if item['musicData'].get('songmid') is not None: songmid = item['musicData']['songmid'] - entries.append(self.url_result(r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) + entries.append(self.url_result( + r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid)) return entries @@ -192,7 +203,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): 'title': '林俊杰', 'description': 'md5:870ec08f7d8547c29c93010899103751', }, - 'playlist_count': 12, + 'playlist_mincount': 12, } def _real_extract(self, url): @@ -200,16 +211,16 @@ class QQMusicSingerIE(QQPlaylistBaseIE): entries = self.get_entries_from_page(mid) singer_page = self._download_webpage(url, mid, 'Download singer page') - singer_name = self._html_search_regex(r"singername : '(.*?)'", singer_page, 'singer name', default=None) + singer_name = self._html_search_regex( + r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None) singer_desc = None if mid: - req = sanitized_Request( - 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singermid=%s' % mid) - req.add_header( - 'Referer', 'https://y.qq.com/n/yqq/singer/') singer_desc_page = self._download_xml( - req, mid, 'Donwload singer description XML') + 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid, + 'Donwload singer description XML', + query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid}, + headers={'Referer': 'https://y.qq.com/n/yqq/singer/'}) singer_desc = singer_desc_page.find('./data/info/desc').text @@ -267,26 +278,25 @@ class QQMusicToplistIE(QQPlaylistBaseIE): _TESTS = [{ 'url': 'https://y.qq.com/n/yqq/toplist/123.html', 'info_dict': { - 'id': 'global_123', + 'id': '123', 'title': '美国iTunes榜', + 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08', }, - 'playlist_count': 10, + 'playlist_count': 100, }, { 'url': 'https://y.qq.com/n/yqq/toplist/3.html', 'info_dict': { - 'id': 'top_3', + 'id': '3', 'title': '巅峰榜·欧美', - 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' - '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' - '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' - '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放' + 'description': 'md5:5a600d42c01696b26b71f8c4d43407da', }, 'playlist_count': 100, }, { 'url': 'https://y.qq.com/n/yqq/toplist/106.html', 'info_dict': { - 'id': 'global_106', + 'id': '106', 'title': '韩国Mnet榜', + 'description': 'md5:cb84b325215e1d21708c615cac82a6e7', }, 'playlist_count': 50, }] @@ -298,16 +308,14 @@ class QQMusicToplistIE(QQPlaylistBaseIE): num_id = list_id toplist_json = self._download_json( - 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' - % (list_type, num_id), - list_id, 'Download toplist page') + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, + note='Download toplist page', + query={'type': 'toplist', 'topid': list_id, 'format': 'json'}) - entries = [ - self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', - song['data']['songmid'] - ) for song in toplist_json['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic', + song['data']['songmid']) + for song in toplist_json['songlist']] topinfo = toplist_json.get('topinfo', {}) list_name = topinfo.get('ListName') @@ -343,8 +351,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): list_id = self._match_id(url) list_json = self._download_json( - 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' - % list_id, list_id, 'Download list page', + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg', + list_id, 'Download list page', + query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id}, transform_source=strip_jsonp) if not len(list_json.get('cdlist', [])): if list_json.get('code'): @@ -354,11 +363,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): raise ExtractorError('Unable to get playlist info') cdlist = list_json['cdlist'][0] - entries = [ - self.url_result( - 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'] - ) for song in cdlist['songlist'] - ] + entries = [self.url_result( + 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']) + for song in cdlist['songlist']] list_name = cdlist.get('dissname') list_description = clean_html(unescapeHTML(cdlist.get('desc'))) From 12f5304556343fafb6a38ad5b4d5ef9fc908f15c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 17 Aug 2017 16:40:56 +0800 Subject: [PATCH 078/238] [ChangeLog] Add entry for #13805 --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 6bafb1e8f..5b897735e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version + +Extractors + ++ [qqmusic] Support new URL schemes (#13805) + + version 2017.08.13 Core From bfabd17b33d47f1e973121483623768010880845 Mon Sep 17 00:00:00 2001 From: Genki Sky Date: Tue, 8 Aug 2017 22:49:57 -0400 Subject: [PATCH 079/238] Add new extractor --- youtube_dl/extractor/clippit.py | 74 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 75 insertions(+) create mode 100644 youtube_dl/extractor/clippit.py diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py new file mode 100644 index 000000000..a1a7a774c --- /dev/null +++ b/youtube_dl/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P[a-z]+)' + _TEST = { + 'url': 'https://www.clippituser.tv/c/evmgm', + 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', + 'info_dict': { + 'id': 'evmgm', + 'ext': 'mp4', + 'title': 'Bye bye Brutus. #BattleBots - Clippit', + 'uploader': 'lizllove', + 'uploader_url': 'https://www.clippituser.tv/p/lizllove', + 'timestamp': 1472183818, + 'upload_date': '20160826', + 'description': 'BattleBots | ABC', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'(.+?)', webpage, 'title') + + FORMATS = ('sd', 'hd') + quality = qualities(FORMATS) + formats = [] + for format_id in FORMATS: + url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + webpage, 'url', fatal=False) + if not url: + continue + match = re.search(r'/(?P\d+)\.mp4', url) + formats.append({ + 'url': url, + 'format_id': format_id, + 'quality': quality(format_id), + 'height': int(match.group('height')) if match else None, + }) + + uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', + webpage, 'uploader', fatal=False) + uploader_url = ('https://www.clippituser.tv/p/' + uploader + if uploader else None) + + timestamp = self._html_search_regex(r'datetime="(.+?)"', + webpage, 'date', fatal=False) + thumbnail = self._html_search_regex(r'data-image="(.+?)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'timestamp': parse_iso8601(timestamp), + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index fb79a1736..ccfa14e7a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -187,6 +187,7 @@ from .chirbit import ( from .cinchcast import CinchcastIE from .cjsw import CJSWIE from .cliphunter import CliphunterIE +from .clippit import ClippitIE from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE From 7ddab7742cad2ff04ec087e3e1d19422c931782b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 17 Aug 2017 16:56:37 +0800 Subject: [PATCH 080/238] [ChangeLog] Add an entry for Genki Sky's patch --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 5b897735e..cf7d1beb0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,7 @@ version Extractors - ++ [clippit] Add support for clippituser.tv + [qqmusic] Support new URL schemes (#13805) From 5d28169747e34850fcb53760c77eccb7f3195ef2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 17 Aug 2017 21:21:17 +0800 Subject: [PATCH 081/238] Credit Genki Sky for clippit (bfabd17b33d) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 053159cc3..478c7872f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -223,3 +223,4 @@ Jan Kundrát Giuseppe Fabiano Örn Guðjónsson Parmjit Virk +Genki Sky From 93d0583e34b0cd826f081a766b00381bb5fed52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 17 Aug 2017 22:45:40 +0700 Subject: [PATCH 082/238] [pluralsight] Use RPC API for course extraction (closes #13937) --- youtube_dl/extractor/pluralsight.py | 52 ++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index d35f54ce8..f6a9131b1 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -18,6 +18,7 @@ from ..utils import ( parse_duration, qualities, srt_subtitles_timecode, + try_get, update_url_query, urlencode_postdata, ) @@ -26,6 +27,39 @@ from ..utils import ( class PluralsightBaseIE(InfoExtractor): _API_BASE = 'https://app.pluralsight.com' + def _download_course(self, course_id, url, display_id): + try: + return self._download_course_rpc(course_id, url, display_id) + except ExtractorError: + # Old API fallback + return self._download_json( + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_id}), + headers={'Referer': url}) + + def _download_course_rpc(self, course_id, url, display_id): + response = self._download_json( + '%s/player/functions/rpc' % self._API_BASE, display_id, + 'Downloading course JSON', + data=json.dumps({ + 'fn': 'bootstrapPlayer', + 'payload': { + 'courseId': course_id, + }, + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + + course = try_get(response, lambda x: x['payload']['course'], dict) + if course: + return course + + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']['message']), + expected=True) + class PluralsightIE(PluralsightBaseIE): IE_NAME = 'pluralsight' @@ -162,10 +196,7 @@ class PluralsightIE(PluralsightBaseIE): display_id = '%s-%s' % (name, clip_id) - course = self._download_json( - 'https://app.pluralsight.com/player/user/api/v1/player/payload', - display_id, data=urlencode_postdata({'courseId': course_name}), - headers={'Referer': url}) + course = self._download_course(course_name, url, display_id) collection = course['modules'] @@ -331,18 +362,7 @@ class PluralsightCourseIE(PluralsightBaseIE): # TODO: PSM cookie - course = self._download_json( - '%s/player/functions/rpc' % self._API_BASE, course_id, - 'Downloading course JSON', - data=json.dumps({ - 'fn': 'bootstrapPlayer', - 'payload': { - 'courseId': course_id, - } - }).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8' - })['payload']['course'] + course = self._download_course(course_id, url, course_id) title = course['title'] course_name = course['name'] From 5f5c7b92dda1da6a0f15af7e3999a6ff298a8c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 17 Aug 2017 23:14:46 +0700 Subject: [PATCH 083/238] [udemy] Fix paid course detection (#13943) --- youtube_dl/extractor/udemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 3b02f43e3..207c4a6a7 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -74,7 +74,7 @@ class UdemyIE(InfoExtractor): return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url checkout_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1', + r'href=(["\'])(?P(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) if checkout_url: raise ExtractorError( From 5551d7714d53caaaae32cdedad11a0bdc95efcf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 17 Aug 2017 23:57:48 +0700 Subject: [PATCH 084/238] [generic] Convert redirect URLs to unicode strings (closes #13951) --- youtube_dl/extractor/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index eff5fbfe8..d2fb2627d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2015,7 +2015,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = head_response.geturl() + new_url = compat_str(head_response.geturl()) if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2116,7 +2116,7 @@ class GenericIE(InfoExtractor): elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict From a5ac0c475589fd1dcd3ba04802f28828c24be6c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 17 Aug 2017 23:59:12 +0700 Subject: [PATCH 085/238] [YoutubeDL] Sanitize byte string format URLs (#13951) --- youtube_dl/YoutubeDL.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index df7378f83..5f4c93ea3 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1483,12 +1483,14 @@ class YoutubeDL(object): def is_wellformed(f): url = f.get('url') - valid_url = url and isinstance(url, compat_str) - if not valid_url: + if not url: self.report_warning( '"url" field is missing or empty - skipping format, ' 'there is an error in extractor') - return valid_url + return False + if isinstance(url, bytes): + sanitize_string_field(f, 'url') + return True # Filter out malformed formats for better extraction robustness formats = list(filter(is_wellformed, formats)) From c0892b2b465cff95d392eaa725e39bd47e4dff58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Aug 2017 00:58:23 +0700 Subject: [PATCH 086/238] [arte] Detect unavailable videos (closes #13945) --- youtube_dl/extractor/arte.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 56baef29d..02613cf5d 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -9,12 +9,13 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + ExtractorError, find_xpath_attr, - unified_strdate, get_element_by_attribute, int_or_none, NO_DEFAULT, qualities, + unified_strdate, ) # There are different sources of video in arte.tv, the extraction process @@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + vsr = player_info['VSR'] + + if not vsr and not player_info.get('VRU'): + raise ExtractorError( + 'Video %s is not available' % player_info.get('VID') or video_id, + expected=True) + upload_date_str = player_info.get('shootingDate') if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] @@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - for format_id, format_dict in player_info['VSR'].items(): + for format_id, format_dict in vsr.items(): f = dict(format_dict) versionCode = f.get('versionCode') l = re.escape(langcode) From 4a919103651905d4e5954c5d655b45055384e283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Aug 2017 01:00:07 +0700 Subject: [PATCH 087/238] [qqmusic:toplist] PEP 8 --- youtube_dl/extractor/qqmusic.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 62f986050..084308aeb 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -304,9 +304,6 @@ class QQMusicToplistIE(QQPlaylistBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - list_type = 'toplist' - num_id = list_id - toplist_json = self._download_json( 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id, note='Download toplist page', From 2738965d98e1883a781a1e9743de0af086c5acd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Aug 2017 01:03:20 +0700 Subject: [PATCH 088/238] [ChangeLog] Actualize --- ChangeLog | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ChangeLog b/ChangeLog index cf7d1beb0..298e0b059 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,19 @@ version +Core +* [YoutubeDL] Sanitize byte string format URLs (#13951) ++ [extractor/common] Add support for float durations in _parse_mpd_formats + (#13919) + Extractors +* [arte] Detect unavailable videos (#13945) +* [generic] Convert redirect URLs to unicode strings (#13951) +* [udemy] Fix paid course detection (#13943) +* [pluralsight] Use RPC API for course extraction (#13937) + [clippit] Add support for clippituser.tv + [qqmusic] Support new URL schemes (#13805) +* [periscope] Renew HLS extraction (#13917) +* [mixcloud] Extract decrypt key version 2017.08.13 From ea004d34f83fd7dd9a00fc3e2deb5a101aff6ea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Aug 2017 01:05:27 +0700 Subject: [PATCH 089/238] release 2017.08.18 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3bd61e0a6..66dd4c480 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.13*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.13** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.18** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.13 +[debug] youtube-dl version 2017.08.18 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 298e0b059..9a0fad673 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.08.18 Core * [YoutubeDL] Sanitize byte string format URLs (#13951) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index cc442742f..1991975cc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -156,6 +156,7 @@ - **Cinchcast** - **CJSW** - **cliphunter** + - **Clippit** - **ClipRs** - **Clipsyndicate** - **CloserToTruth** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index da855a602..4358cd3f2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.13' +__version__ = '2017.08.18' From d14d9d8903a532e346dffc3b83730045f18f2c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Aug 2017 23:31:42 +0700 Subject: [PATCH 090/238] [mixcloud] Fix extraction (closes #13958) --- youtube_dl/extractor/mixcloud.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index fcf7beeb2..798968ae3 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -91,12 +91,14 @@ class MixcloudIE(InfoExtractor): if js_url: js = self._download_webpage(js_url, track_id, fatal=False) if js: - key = self._search_regex( - r'player\s*:\s*{.*?\bvalue\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', - js, 'key', default=None, group='key') - if key and isinstance(key, compat_str): - self._keys.insert(0, key) - self._current_key = key + KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' + for key_name in ('value', 'key_value'): + key = self._search_regex( + KEY_RE_TEMPLATE % key_name, js, 'key', + default=None, group='key') + if key and isinstance(key, compat_str): + self._keys.insert(0, key) + self._current_key = key message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', From f5469da9e6e259c1690c7ef54f1da1c19f65036f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Aug 2017 19:48:20 +0700 Subject: [PATCH 091/238] [laola1tv] Add support for tv.ittf.com (closes #13965) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/laola1tv.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ccfa14e7a..bda6826f1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -509,6 +509,7 @@ from .la7 import LA7IE from .laola1tv import ( Laola1TvEmbedIE, Laola1TvIE, + ITTFIE, ) from .lci import LCIIE from .lcp import ( diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 1f91ba017..c7f813370 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -215,3 +215,21 @@ class Laola1TvIE(Laola1TvEmbedIE): 'formats': formats, 'is_live': is_live, } + + +class ITTFIE(InfoExtractor): + _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P\d+)' + _TEST = { + 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802', + 'only_matching': True, + } + + def _real_extract(self, url): + return self.url_result( + update_url_query('https://www.laola1.tv/titanplayer.php', { + 'videoid': self._match_id(url), + 'type': 'V', + 'lang': 'en', + 'portal': 'int', + 'customer': 1024, + }), Laola1TvEmbedIE.ie_key()) From 95f3f7c20a05e7ac490e768b8470b20538ef8581 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 19 Aug 2017 21:40:53 +0800 Subject: [PATCH 092/238] [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) --- ChangeLog | 6 ++++++ test/test_utils.py | 1 + youtube_dl/utils.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 9a0fad673..9eab4d1e7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Core +* [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) + + version 2017.08.18 Core diff --git a/test/test_utils.py b/test/test_utils.py index 2aab16b97..e50f3764e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -279,6 +279,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('/'), '/') self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + self.assertEqual(unescapeHTML('&a"'), '&a"') # HTML5 entities self.assertEqual(unescapeHTML('.''), '.\'') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c9cbd5842..2554a2abd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -596,7 +596,7 @@ def unescapeHTML(s): assert type(s) == compat_str return re.sub( - r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) def get_subprocess_encoding(): From f8f18f332f235bcfa2f8fc161887e0eef283fec0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 19 Aug 2017 21:44:47 +0800 Subject: [PATCH 093/238] [cda] Fix extraction (closes #13935) --- ChangeLog | 1 + youtube_dl/extractor/cda.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 9eab4d1e7..6c32747c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Core +* [cda] Fix extraction (closes #13935) * [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 78b7a923c..0c3af23d5 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -124,7 +124,7 @@ class CDAIE(InfoExtractor): } def extract_format(page, version): - json_str = self._search_regex( + json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P.+?)\1', page, '%s player_json' % version, fatal=False, group='player_data') if not json_str: From 09747ba7663a9c6f89530c7ffbd95cb4776db6bf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 19 Aug 2017 22:27:53 +0800 Subject: [PATCH 094/238] [liveleak] Support another liveleak embedding pattern (closes #13336) --- ChangeLog | 3 +++ youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/generic.py | 23 +++++++++++++++----- youtube_dl/extractor/liveleak.py | 35 +++++++++++++++++++++++++----- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6c32747c4..4c7997b2e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,9 @@ Core * [cda] Fix extraction (closes #13935) * [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) +Extractors ++ [liveleak] Support another liveleak embedding pattern (#13336) + version 2017.08.18 diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bda6826f1..17048fd6e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -537,7 +537,10 @@ from .limelight import ( LimelightChannelListIE, ) from .litv import LiTVIE -from .liveleak import LiveLeakIE +from .liveleak import ( + LiveLeakIE, + LiveLeakEmbedIE, +) from .livestream import ( LivestreamIE, LivestreamOriginalIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d2fb2627d..49b00b87e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1519,14 +1519,27 @@ class GenericIE(InfoExtractor): # LiveLeak embed { 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'md5': '7619da8c820e835bef21a1efa2a0fc71', 'info_dict': { 'id': '874_1459135191', 'ext': 'mp4', 'title': 'Man shows poor quality of new apartment building', 'description': 'The wall is like a sand pile.', 'uploader': 'Lake8737', - } + }, + 'add_ie': [LiveLeakIE.ie_key()], + }, + # Another LiveLeak embed pattern (#13336) + { + 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', + 'info_dict': { + 'id': '2eb_1496309988', + 'ext': 'mp4', + 'title': 'Thief robs place where everyone was armed', + 'description': 'md5:694d73ee79e535953cf2488562288eee', + 'uploader': 'brazilwtf', + }, + 'add_ie': [LiveLeakIE.ie_key()], }, # Duplicated embedded video URLs { @@ -2757,9 +2770,9 @@ class GenericIE(InfoExtractor): self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds - liveleak_url = LiveLeakIE._extract_url(webpage) - if liveleak_url: - return self.url_result(liveleak_url, 'LiveLeak') + liveleak_urls = LiveLeakIE._extract_urls(webpage) + if liveleak_urls: + return self.playlist_from_matches(liveleak_urls, video_id, video_title) # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index b2247a84d..d23eaa355 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -75,12 +75,10 @@ class LiveLeakIE(InfoExtractor): }] @staticmethod - def _extract_url(webpage): - mobj = re.search( - r']+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P[\w_]+)(?:.*)', + def _extract_urls(webpage): + return re.findall( + r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', webpage) - if mobj: - return 'http://www.liveleak.com/view?i=%s' % mobj.group('id') def _real_extract(self, url): video_id = self._match_id(url) @@ -131,3 +129,30 @@ class LiveLeakIE(InfoExtractor): }) return info_dict + + +class LiveLeakEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[if])=(?P[\w_]+)' + + # See generic.py for actual test cases + _TESTS = [{ + 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191', + 'only_matching': True, + }, { + 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id = mobj.group('kind', 'id') + + if kind == 'f': + webpage = self._download_webpage(url, video_id) + liveleak_url = self._search_regex( + r'logourl\s*:\s*(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, + webpage, 'LiveLeak URL', group='url') + elif kind == 'i': + liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + + return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) From e2481b9b6e621e43fd77e395fd2283ce262b71f3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 19 Aug 2017 22:28:58 +0800 Subject: [PATCH 095/238] [ChangeLog] Fix --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 4c7997b2e..320609a4f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,11 +1,11 @@ version Core -* [cda] Fix extraction (closes #13935) * [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) Extractors + [liveleak] Support another liveleak embedding pattern (#13336) +* [cda] Fix extraction (#13935) version 2017.08.18 From 381ad4f30998443fabc4c8633caa548685f49c6b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 19 Aug 2017 22:48:00 +0800 Subject: [PATCH 096/238] [liveleak] Support multi-video pages (closes #6542) --- ChangeLog | 1 + youtube_dl/extractor/liveleak.py | 43 ++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 320609a4f..c07cb9648 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,7 @@ Core * [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) Extractors ++ [liveleak] Support multi-video pages (#6542) + [liveleak] Support another liveleak embedding pattern (#13336) * [cda] Fix extraction (#13935) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index d23eaa355..246aac576 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -72,6 +72,13 @@ class LiveLeakIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.liveleak.com/view?i=677_1439397581', + 'info_dict': { + 'id': '677_1439397581', + 'title': 'Fuel Depot in China Explosion caught on video', + }, + 'playlist_count': 3, }] @staticmethod @@ -109,26 +116,30 @@ class LiveLeakIE(InfoExtractor): 'age_limit': age_limit, } - info_dict = entries[0] + for idx, info_dict in enumerate(entries): + for a_format in info_dict['formats']: + if not a_format.get('height'): + a_format['height'] = int_or_none(self._search_regex( + r'([0-9]+)p\.mp4', a_format['url'], 'height label', + default=None)) - for a_format in info_dict['formats']: - if not a_format.get('height'): - a_format['height'] = int_or_none(self._search_regex( - r'([0-9]+)p\.mp4', a_format['url'], 'height label', - default=None)) + self._sort_formats(info_dict['formats']) - self._sort_formats(info_dict['formats']) + # Don't append entry ID for one-video pages to keep backward compatibility + if len(entries) > 1: + info_dict['id'] = '%s_%s' % (video_id, idx + 1) + else: + info_dict['id'] = video_id - info_dict.update({ - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - 'thumbnail': video_thumbnail, - }) + info_dict.update({ + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + 'thumbnail': video_thumbnail, + }) - return info_dict + return self.playlist_result(entries, video_id, video_title) class LiveLeakEmbedIE(InfoExtractor): From d3d45e0a451bab2cc36181bb50bf3c129a7a5ec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Aug 2017 23:54:15 +0700 Subject: [PATCH 097/238] [bbccouk] Add support for events URLs (closes #13893) --- youtube_dl/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 79ded6ba1..911ae6780 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -37,7 +37,8 @@ class BBCCoUkIE(InfoExtractor): programmes/(?!articles/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| - radio/player/ + radio/player/| + events/[^/]+/play/[^/]+/ ) (?P%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX From 305d99f0bd1effc0e164792199bf93a872da2962 Mon Sep 17 00:00:00 2001 From: "Bernhard M. Wiedemann" Date: Mon, 17 Jul 2017 13:49:09 +0200 Subject: [PATCH 098/238] [build] Override timestamps in zip file to make build reproducible. See https://reproducible-builds.org/ for why this is good Copying files to not interfere with freshness detection. --- Makefile | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 84ccce2b3..41e3a683a 100644 --- a/Makefile +++ b/Makefile @@ -46,8 +46,15 @@ tar: youtube-dl.tar.gz pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish youtube-dl: youtube_dl/*.py youtube_dl/*/*.py - zip --quiet youtube-dl youtube_dl/*.py youtube_dl/*/*.py - zip --quiet --junk-paths youtube-dl youtube_dl/__main__.py + mkdir -p zip + for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ + mkdir -p zip/$$d ;\ + cp -a $$d/*.py zip/$$d/ ;\ + done + touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py + mv zip/youtube_dl/__main__.py zip/ + cd zip ; zip --quiet ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py + rm -rf zip echo '#!$(PYTHON)' > youtube-dl cat youtube-dl.zip >> youtube-dl rm youtube-dl.zip From b359e977b9bdff704cd58f6f3b34185ecbe450e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Aug 2017 14:16:58 +0700 Subject: [PATCH 099/238] [extractor/common] Make HLS and DASH extraction non fatal in _parse_html5_media_entries (closes #13970) --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e747258aa..ceba4ca1c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2123,11 +2123,11 @@ class InfoExtractor(object): formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference) + preference=preference, fatal=False) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id) + full_url, video_id, mpd_id=mpd_id, fatal=False) else: is_plain_url = True formats = [{ From 8239c6791a36813cacc337c5c4a8801d181b8b54 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Sun, 20 Aug 2017 09:32:33 -0700 Subject: [PATCH 100/238] [bandcamp:album] Extract track titles --- youtube_dl/extractor/bandcamp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 9ddb9af17..be41bd5a2 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -242,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor): raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ - self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) + self.url_result( + compat_urlparse.urljoin(url, t_path), + ie=BandcampIE.ie_key(), + video_title=self._search_regex( + r']+\bitemprop=["\']name["\'][^>]*>([^<]+)', + elem_content, 'track title', fatal=False)) for elem_content, t_path in track_elements if self._html_search_meta('duration', elem_content, default=None)] From 903d4d1625f59b6fb359a898fbb512cb2d6181e9 Mon Sep 17 00:00:00 2001 From: Alan Yee Date: Sun, 20 Aug 2017 09:35:39 -0700 Subject: [PATCH 101/238] [README.md] Switch to HTTPS URLs --- README.md | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 0067184be..6f5d00df3 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl -Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`). +Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](https://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`). You can also use pip: @@ -33,7 +33,7 @@ You can also use pip: This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. -OS X users can install youtube-dl with [Homebrew](http://brew.sh/): +OS X users can install youtube-dl with [Homebrew](https://brew.sh/): brew install youtube-dl @@ -458,7 +458,7 @@ You can also use `--config-location` if you want to use custom configuration fil ### Authentication with `.netrc` file -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: ``` touch $HOME/.netrc chmod a-rwx,u+rw $HOME/.netrc @@ -485,7 +485,7 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title @@ -603,7 +603,7 @@ $ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext) $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ # Download entire series season keeping each series and each season in separate directory under C:/MyVideos -$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" http://videomore.ru/kino_v_detalayah/5_sezon/367617 +$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617 # Stream the video being downloaded to stdout $ youtube-dl -o - BaW_jenozKc @@ -716,17 +716,17 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231 ### How do I update youtube-dl? -If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). +If you've followed [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to https://yt-dl.org to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like sudo apt-get remove -y youtube-dl -Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html): +Afterwards, simply follow [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html): ``` sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl @@ -766,11 +766,11 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much. youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option. -Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. +Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](https://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](https://www.videolan.org/) or [mplayer](https://www.mplayerhq.hu/). ### I extracted a video URL with `-g`, but it does not play on another machine / in my web browser. @@ -845,10 +845,10 @@ Use the `-o` to specify an [output template](#output-template), for example `-o ### How do I download a video starting with a `-`? -Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the options with `--`: +Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the options with `--`: youtube-dl -- -wNyEUrxzFU - youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" + youtube-dl "https://www.youtube.com/watch?v=-wNyEUrxzFU" ### How do I pass cookies to youtube-dl? @@ -862,9 +862,9 @@ Passing cookies to youtube-dl is a good way to workaround login when a particula ### How do I stream directly to media player? -You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: +You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](https://www.videolan.org/) can be achieved with: - youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - + youtube-dl -o - "https://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - ### How do I download only new videos from a playlist? @@ -884,7 +884,7 @@ When youtube-dl detects an HLS video, it can download it either with the built-i When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg. -In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](http://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader. +In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](https://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader. If you put either `--hls-prefer-native` or `--hls-prefer-ffmpeg` into your configuration, a different subset of videos will fail to download correctly. Instead, it is much better to [file an issue](https://yt-dl.org/bug) or a pull request which details why the native or the ffmpeg HLS downloader is a better choice for your use case. @@ -910,7 +910,7 @@ Feel free to bump the issue from time to time by writing a small comment ("Issue ### How can I detect whether a given URL is supported by youtube-dl? -For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. +For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from https://example.com/video/1234567 to https://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. @@ -924,7 +924,7 @@ youtube-dl is an open-source project manned by too few volunteers, so we'd rathe # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute @@ -972,7 +972,7 @@ After you have ensured this site is distributing its content legally, you can fo class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { - 'url': 'http://yourextractor.com/watch/42', + 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', @@ -1005,8 +1005,8 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py @@ -1162,7 +1162,7 @@ import youtube_dl ydl_opts = {} with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object. @@ -1201,19 +1201,19 @@ ydl_opts = { 'progress_hooks': [my_hook], } with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` # BUGS -Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` $ youtube-dl -v [debug] System config: [] [debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e @@ -1244,7 +1244,7 @@ For bug reports, this means that your report should contain the *complete* outpu If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). -**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? From 8d9c2a681a1dcf99ab949e79b1b9da17513e11d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 21 Aug 2017 23:06:27 +0700 Subject: [PATCH 102/238] [pornhub] Relax uploader regex (closes #13906, closes #13975) --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index f6777cd26..3428458af 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -186,7 +186,7 @@ class PornHubIE(InfoExtractor): title, thumbnail, duration = [None] * 3 video_uploader = self._html_search_regex( - r'(?s)From: .+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<', + r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', webpage, 'uploader', fatal=False) view_count = self._extract_count( From 05915e379a2406988f752722dfaa815804fb7fb8 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Tue, 22 Aug 2017 11:48:59 -0500 Subject: [PATCH 103/238] [googledrive] Add support for subtitles (fixes #13619) --- youtube_dl/extractor/googledrive.py | 104 +++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index c40da85c5..35edc7440 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -7,6 +7,8 @@ from ..utils import ( ExtractorError, int_or_none, lowercase_escape, + error_to_compat_str, + update_url_query, ) @@ -24,7 +26,14 @@ class GoogleDriveIE(InfoExtractor): }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'only_matching': True, + 'md5': 'c230c67252874fddd8170e3fd1a45886', + 'info_dict': { + 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', + 'ext': 'mp4', + 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', + 'duration': 189, + }, + 'only_matching': True }] _FORMATS_EXT = { '5': 'flv', @@ -44,6 +53,13 @@ class GoogleDriveIE(InfoExtractor): '46': 'webm', '59': 'mp4', } + _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' + _CAPTIONS_ENTRY_TAG = { + 'subtitles': 'track', + 'automatic_captions': 'target', + } + _caption_formats_ext = [] + _captions_by_country_xml = None @staticmethod def _extract_url(webpage): @@ -53,6 +69,81 @@ class GoogleDriveIE(InfoExtractor): if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') + def _set_captions_data(self, video_id, video_subtitles_id, hl): + try: + self._captions_by_country_xml = self._download_xml(self._BASE_URL_CAPTIONS, video_id, query={ + 'id': video_id, + 'vid': video_subtitles_id, + 'hl': hl, + 'v': video_id, + 'type': 'list', + 'tlangs': '1', + 'fmts': '1', + 'vssids': '1', + }) + except ExtractorError as ee: + self.report_warning('unable to download video subtitles: %s' % error_to_compat_str(ee)) + if self._captions_by_country_xml is not None: + caption_available_extensions = self._captions_by_country_xml.findall('format') + for caption_extension in caption_available_extensions: + if caption_extension.attrib.get('fmt_code') and not caption_extension.attrib.get('default'): + self._caption_formats_ext.append(caption_extension.attrib['fmt_code']) + + def _get_captions_by_type(self, video_id, video_subtitles_id, caption_type, caption_original_lang_code=None): + if not video_subtitles_id or not caption_type: + return None + captions = {} + for caption_entry in self._captions_by_country_xml.findall(self._CAPTIONS_ENTRY_TAG[caption_type]): + caption_lang_code = caption_entry.attrib.get('lang_code') + if not caption_lang_code: + continue + caption_format_data = [] + for caption_format in self._caption_formats_ext: + query = { + 'vid': video_subtitles_id, + 'v': video_id, + 'fmt': caption_format, + 'lang': caption_lang_code if caption_original_lang_code is None else caption_original_lang_code, + 'type': 'track', + 'name': '', + 'kind': '', + } + if caption_original_lang_code is not None: + query.update({'tlang': caption_lang_code}) + caption_format_data.append({ + 'url': update_url_query(self._BASE_URL_CAPTIONS, query), + 'ext': caption_format, + }) + captions[caption_lang_code] = caption_format_data + if not captions: + self.report_warning('video doesn\'t have %s' % caption_type.replace('_', ' ')) + return captions + + def _get_subtitles(self, video_id, video_subtitles_id, hl): + if not video_subtitles_id or not hl: + return None + if self._captions_by_country_xml is None: + self._set_captions_data(video_id, video_subtitles_id, hl) + if self._captions_by_country_xml is None: + return None + return self._get_captions_by_type(video_id, video_subtitles_id, 'subtitles') + + def _get_automatic_captions(self, video_id, video_subtitles_id, hl): + if not video_subtitles_id or not hl: + return None + if self._captions_by_country_xml is None: + self._set_captions_data(video_id, video_subtitles_id, hl) + if self._captions_by_country_xml is None: + return None + self.to_screen('%s: Looking for automatic captions' % video_id) + subtitle_original_track = self._captions_by_country_xml.find('track') + if subtitle_original_track is None: + return None + subtitle_original_lang_code = subtitle_original_track.attrib.get('lang_code') + if not subtitle_original_lang_code: + return None + return self._get_captions_by_type(video_id, video_subtitles_id, 'automatic_captions', subtitle_original_lang_code) + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( @@ -97,10 +188,21 @@ class GoogleDriveIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + hl = self._search_regex( + r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) + video_subtitles_id = None + ttsurl = self._search_regex( + r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) + if ttsurl: + # the video Id for subtitles will be the last value in the ttsurl query string + video_subtitles_id = ttsurl.encode('utf-8').decode('unicode_escape').split('=')[-1] + return { 'id': video_id, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, 'formats': formats, + 'subtitles': self.extract_subtitles(video_id, video_subtitles_id, hl), + 'automatic_captions': self.extract_automatic_captions(video_id, video_subtitles_id, hl), } From e01c3d2ef7264b5d3d6f99e7e0b61340885ed661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 00:32:41 +0700 Subject: [PATCH 104/238] [extractor/common] Introduce _parse_xml --- youtube_dl/extractor/common.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ceba4ca1c..1804c4de0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..downloader.f4m import remove_encrypted_media from ..utils import ( @@ -646,15 +647,29 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal) + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: xml_string = transform_source(xml_string) - return compat_etree_fromstring(xml_string.encode('utf-8')) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', From 37d9af306a928ce2184dcb60883e98ec0dd570ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 00:33:53 +0700 Subject: [PATCH 105/238] [googledrive] Simplify and carry long lines (#13638) --- youtube_dl/extractor/googledrive.py | 115 +++++++++++++++------------- 1 file changed, 60 insertions(+), 55 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 35edc7440..97ff28219 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -7,7 +7,6 @@ from ..utils import ( ExtractorError, int_or_none, lowercase_escape, - error_to_compat_str, update_url_query, ) @@ -59,7 +58,7 @@ class GoogleDriveIE(InfoExtractor): 'automatic_captions': 'target', } _caption_formats_ext = [] - _captions_by_country_xml = None + _captions_xml = None @staticmethod def _extract_url(webpage): @@ -69,96 +68,99 @@ class GoogleDriveIE(InfoExtractor): if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') - def _set_captions_data(self, video_id, video_subtitles_id, hl): - try: - self._captions_by_country_xml = self._download_xml(self._BASE_URL_CAPTIONS, video_id, query={ + def _download_subtitles_xml(self, video_id, subtitles_id, hl): + if self._captions_xml: + return + self._captions_xml = self._download_xml( + self._BASE_URL_CAPTIONS, video_id, query={ 'id': video_id, - 'vid': video_subtitles_id, + 'vid': subtitles_id, 'hl': hl, 'v': video_id, 'type': 'list', 'tlangs': '1', 'fmts': '1', 'vssids': '1', - }) - except ExtractorError as ee: - self.report_warning('unable to download video subtitles: %s' % error_to_compat_str(ee)) - if self._captions_by_country_xml is not None: - caption_available_extensions = self._captions_by_country_xml.findall('format') - for caption_extension in caption_available_extensions: - if caption_extension.attrib.get('fmt_code') and not caption_extension.attrib.get('default'): - self._caption_formats_ext.append(caption_extension.attrib['fmt_code']) + }, note='Downloading subtitles XML', + errnote='Unable to download subtitles XML', fatal=False) + if self._captions_xml: + for f in self._captions_xml.findall('format'): + if f.attrib.get('fmt_code') and not f.attrib.get('default'): + self._caption_formats_ext.append(f.attrib['fmt_code']) - def _get_captions_by_type(self, video_id, video_subtitles_id, caption_type, caption_original_lang_code=None): - if not video_subtitles_id or not caption_type: - return None + def _get_captions_by_type(self, video_id, subtitles_id, caption_type, + origin_lang_code=None): + if not subtitles_id or not caption_type: + return captions = {} - for caption_entry in self._captions_by_country_xml.findall(self._CAPTIONS_ENTRY_TAG[caption_type]): + for caption_entry in self._captions_xml.findall( + self._CAPTIONS_ENTRY_TAG[caption_type]): caption_lang_code = caption_entry.attrib.get('lang_code') if not caption_lang_code: continue caption_format_data = [] for caption_format in self._caption_formats_ext: query = { - 'vid': video_subtitles_id, + 'vid': subtitles_id, 'v': video_id, 'fmt': caption_format, - 'lang': caption_lang_code if caption_original_lang_code is None else caption_original_lang_code, + 'lang': (caption_lang_code if origin_lang_code is None + else origin_lang_code), 'type': 'track', 'name': '', 'kind': '', } - if caption_original_lang_code is not None: + if origin_lang_code is not None: query.update({'tlang': caption_lang_code}) caption_format_data.append({ 'url': update_url_query(self._BASE_URL_CAPTIONS, query), 'ext': caption_format, }) captions[caption_lang_code] = caption_format_data - if not captions: - self.report_warning('video doesn\'t have %s' % caption_type.replace('_', ' ')) return captions - def _get_subtitles(self, video_id, video_subtitles_id, hl): - if not video_subtitles_id or not hl: - return None - if self._captions_by_country_xml is None: - self._set_captions_data(video_id, video_subtitles_id, hl) - if self._captions_by_country_xml is None: - return None - return self._get_captions_by_type(video_id, video_subtitles_id, 'subtitles') + def _get_subtitles(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + return self._get_captions_by_type(video_id, subtitles_id, 'subtitles') - def _get_automatic_captions(self, video_id, video_subtitles_id, hl): - if not video_subtitles_id or not hl: - return None - if self._captions_by_country_xml is None: - self._set_captions_data(video_id, video_subtitles_id, hl) - if self._captions_by_country_xml is None: - return None - self.to_screen('%s: Looking for automatic captions' % video_id) - subtitle_original_track = self._captions_by_country_xml.find('track') - if subtitle_original_track is None: - return None - subtitle_original_lang_code = subtitle_original_track.attrib.get('lang_code') - if not subtitle_original_lang_code: - return None - return self._get_captions_by_type(video_id, video_subtitles_id, 'automatic_captions', subtitle_original_lang_code) + def _get_automatic_captions(self, video_id, subtitles_id, hl): + if not subtitles_id or not hl: + return + self._download_subtitles_xml(video_id, subtitles_id, hl) + if not self._captions_xml: + return + track = self._captions_xml.find('track') + if track is None: + return + origin_lang_code = track.attrib.get('lang_code') + if not origin_lang_code: + return + return self._get_captions_by_type( + video_id, subtitles_id, 'automatic_captions', origin_lang_code) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://docs.google.com/file/d/%s' % video_id, video_id) - reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + reason = self._search_regex( + r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) if reason: raise ExtractorError(reason) title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') duration = int_or_none(self._search_regex( - r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', + default=None)) fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') - fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, + 'fmt stream map').split(',') + fmt_list = self._search_regex( + r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') resolutions = {} for fmt in fmt_list: @@ -190,12 +192,14 @@ class GoogleDriveIE(InfoExtractor): hl = self._search_regex( r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) - video_subtitles_id = None + subtitles_id = None ttsurl = self._search_regex( r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) if ttsurl: - # the video Id for subtitles will be the last value in the ttsurl query string - video_subtitles_id = ttsurl.encode('utf-8').decode('unicode_escape').split('=')[-1] + # the video Id for subtitles will be the last value in the ttsurl + # query string + subtitles_id = ttsurl.encode('utf-8').decode( + 'unicode_escape').split('=')[-1] return { 'id': video_id, @@ -203,6 +207,7 @@ class GoogleDriveIE(InfoExtractor): 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, 'formats': formats, - 'subtitles': self.extract_subtitles(video_id, video_subtitles_id, hl), - 'automatic_captions': self.extract_automatic_captions(video_id, video_subtitles_id, hl), + 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl), + 'automatic_captions': self.extract_automatic_captions( + video_id, subtitles_id, hl), } From 8d7a24aff60a57e651bab40f16a81eb7dffb405c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 22:28:09 +0700 Subject: [PATCH 106/238] [toutv] Relax DRM check (closes #13994) --- youtube_dl/extractor/toutv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 26d770992..071388dcc 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -78,8 +78,10 @@ class TouTvIE(InfoExtractor): def _real_extract(self, url): path = self._match_id(url) metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) + # IsDrm does not necessarily mean the video is DRM protected (see + # https://github.com/rg3/youtube-dl/issues/13994). if metadata.get('IsDrm'): - raise ExtractorError('This video is DRM protected.', expected=True) + self.report_warning('This video is probably DRM protected.', path) video_id = metadata['IdMedia'] details = metadata['Details'] title = details['OriginalTitle'] From 0830f3e04842a58eb563962940ceb2bed27aac1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 22:45:45 +0700 Subject: [PATCH 107/238] [cbc:watch] Bypass geo-restriction (closes #13993) --- youtube_dl/extractor/cbc.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 87ad14e91..9faf40227 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -200,6 +200,7 @@ class CBCWatchBaseIE(InfoExtractor): 'media': 'http://search.yahoo.com/mrss/', 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } + _GEO_COUNTRIES = ['CA'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -287,6 +288,11 @@ class CBCWatchBaseIE(InfoExtractor): class CBCWatchVideoIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch:video' _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + # geo-restricted to Canada, bypassable + 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', + 'only_matching': True, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -323,9 +329,10 @@ class CBCWatchIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch' _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P[0-9a-f-]+)' _TESTS = [{ + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', 'info_dict': { - 'id': '38e815a-009e3ab12e4', + 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', 'ext': 'mp4', 'title': 'Customer (Dis)Service', 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', @@ -337,8 +344,8 @@ class CBCWatchIE(CBCWatchBaseIE): 'skip_download': True, 'format': 'bestvideo', }, - 'skip': 'Geo-restricted to Canada', }, { + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', 'info_dict': { 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', @@ -346,7 +353,6 @@ class CBCWatchIE(CBCWatchBaseIE): 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', }, 'playlist_mincount': 30, - 'skip': 'Geo-restricted to Canada', }] def _real_extract(self, url): From 5bae33485c223fdf230254fa424f972b3c51e77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 22:50:00 +0700 Subject: [PATCH 108/238] [toutv] PEP 8 --- youtube_dl/extractor/toutv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 071388dcc..e59ed2661 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, js_to_json, - ExtractorError, urlencode_postdata, extract_attributes, smuggle_url, From c4bdc6811307c002a399b59860d311591145397f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 23:21:19 +0700 Subject: [PATCH 109/238] [ChangeLog] Actualize --- ChangeLog | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index c07cb9648..4104b6ded 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,12 +1,23 @@ version Core ++ [extractor/common] Introduce _parse_xml +* [extractor/common] Make HLS and DASH extraction in_parse_html5_media_entries + non fatal (#13970) * [utils] Fix unescapeHTML for misformed string like "&a"" (#13935) Extractors +* [cbc:watch] Bypass geo restriction (#13993) +* [toutv] Relax DRM check (#13994) ++ [googledrive] Add support for subtitles (#13619, #13638) +* [pornhub] Relax uploader regular expression (#13906, #13975) +* [bandcamp:album] Extract track titles (#13962) ++ [bbccouk] Add support for events URLs (#13893) + [liveleak] Support multi-video pages (#6542) + [liveleak] Support another liveleak embedding pattern (#13336) * [cda] Fix extraction (#13935) ++ [laola1tv] Add support for tv.ittf.com (#13965) +* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003) version 2017.08.18 @@ -129,7 +140,7 @@ Extractors * [youku:show] Fix playlist extraction (#13248) + [dispeak] Recognize sevt subdomain (#13276) * [adn] Improve error reporting (#13663) -* [crunchyroll] Relax series and season regex (#13659) +* [crunchyroll] Relax series and season regular expression (#13659) + [spiegel:article] Add support for nexx iframe embeds (#13029) + [nexx:embed] Add support for iframe embeds * [nexx] Improve JS embed extraction From df235dbba8d8ae3b51ad3432f67d0cb661dadd75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Aug 2017 23:23:13 +0700 Subject: [PATCH 110/238] release 2017.08.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 12 ++++++------ ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 66dd4c480..3e1ff1536 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.18 +[debug] youtube-dl version 2017.08.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d606eab0e..a8091e7b5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,7 +3,7 @@ $ youtube-dl -v [debug] System config: [] [debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e @@ -34,7 +34,7 @@ For bug reports, this means that your report should contain the *complete* outpu If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). -**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? @@ -70,7 +70,7 @@ It may sound strange, but some bug reports we receive are completely unrelated t # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute @@ -118,7 +118,7 @@ After you have ensured this site is distributing its content legally, you can fo class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { - 'url': 'http://yourextractor.com/watch/42', + 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', @@ -151,8 +151,8 @@ After you have ensured this site is distributing its content legally, you can fo 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py diff --git a/ChangeLog b/ChangeLog index 4104b6ded..a60bd5fc8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.08.23 Core + [extractor/common] Introduce _parse_xml diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1991975cc..dbec6c8dc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -363,6 +363,7 @@ - **IPrima** - **iqiyi**: 爱奇艺 - **Ir90Tv** + - **ITTF** - **ITV** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations @@ -419,6 +420,7 @@ - **limelight:channel_list** - **LiTV** - **LiveLeak** + - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4358cd3f2..94d35a66a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.18' +__version__ = '2017.08.23' From 745968bc72f5dcf1271559d58abd3f0d9a2ea01e Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Thu, 24 Aug 2017 20:58:44 +0530 Subject: [PATCH 111/238] [mixcloud] Fix extraction (closes #14015) --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 798968ae3..25d3fc4b3 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -92,7 +92,7 @@ class MixcloudIE(InfoExtractor): js = self._download_webpage(js_url, track_id, fatal=False) if js: KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value'): + for key_name in ('value', 'key_value', 'key_value_two'): key = self._search_regex( KEY_RE_TEMPLATE % key_name, js, 'key', default=None, group='key') From c7121fa7b804d2b9e35dec05f8861e4ebba8afd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Aug 2017 15:38:38 +0700 Subject: [PATCH 112/238] [youtube] Fix controversy videos extraction (closes #14027, closes #14029) --- youtube_dl/extractor/youtube.py | 36 ++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e71795e7..3d0f5a5c9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1003,6 +1003,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Skipping DASH manifest', ], }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + 'license': 'Standard YouTube License', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { # itag 212 'url': '1t24XAntNCY', @@ -1437,9 +1458,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + # Get video info embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1509,6 +1535,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue get_video_info = compat_parse_qs(video_info_webpage) add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: @@ -1592,10 +1620,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self.playlist_result(entries, video_id, video_title, video_description) self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - if 'view_count' in video_info: - view_count = int(video_info['view_count'][0]) - else: - view_count = None + if view_count is None: + view_count = extract_view_count(get_video_info) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: From 151978f38a915afd0169e68955b3b7394c54e367 Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Sat, 26 Aug 2017 18:02:57 +0530 Subject: [PATCH 113/238] [mixcloud] Fix extraction (closes #14020) --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 25d3fc4b3..f6360cce6 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -92,7 +92,7 @@ class MixcloudIE(InfoExtractor): js = self._download_webpage(js_url, track_id, fatal=False) if js: KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value', 'key_value_two'): + for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): key = self._search_regex( KEY_RE_TEMPLATE % key_name, js, 'key', default=None, group='key') From 085d9dd9bebfd1692cfe07e8bcb844780bfe4700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Aug 2017 22:02:49 +0700 Subject: [PATCH 114/238] [rai] Fix audio formats extraction (closes #14024) --- youtube_dl/extractor/rai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e11bf8f9a..5bf64a56b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -345,11 +345,11 @@ class RaiIE(RaiBaseIE): media_type = media['type'] if 'Audio' in media_type: relinker_info = { - 'formats': { + 'formats': [{ 'format_id': media.get('formatoAudio'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), - } + }] } elif 'Video' in media_type: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) From a3c3a1e12898a57fc2323e4c7cc37ace35482ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Aug 2017 23:55:48 +0700 Subject: [PATCH 115/238] [http] Rework HTTP downloader * Simplify code and split into separate routines to facilitate maintaining * Make retry mechanism work on errors during actual download not only during connection establishment phase * Retry on ECONNRESET and ETIMEDOUT during reading data from network * Retry on content too short and various timeout errors * Show error description on retry * Closes #506, closes #809, closes #2849, closes #4240, closes #6023, closes #8625, closes #9483 --- youtube_dl/downloader/common.py | 6 +- youtube_dl/downloader/http.py | 343 ++++++++++++++++++-------------- 2 files changed, 197 insertions(+), 152 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 77242dacc..75b8166c5 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -304,11 +304,11 @@ class FileDownloader(object): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, count, retries): + def report_retry(self, err, count, retries): """Report retry in case of HTTP error 5xx""" self.to_screen( - '[download] Got server HTTP error. Retrying (attempt %d of %s)...' - % (count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' + % (error_to_compat_str(err), count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index af405b950..8a6638cc2 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -22,8 +22,16 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): url = info_dict['url'] - tmpfilename = self.temp_name(filename) - stream = None + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} @@ -38,46 +46,51 @@ class HttpFD(FileDownloader): if is_test: request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - else: - resume_len = 0 + ctx.open_mode = 'wb' + ctx.resume_len = 0 - open_mode = 'wb' - if resume_len != 0: - if self.params.get('continuedl', True): - self.report_resuming_byte(resume_len) - request.add_header('Range', 'bytes=%d-' % resume_len) - open_mode = 'ab' - else: - resume_len = 0 + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) count = 0 retries = self.params.get('retries', 0) - while count <= retries: + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + def establish_connection(): + if ctx.resume_len != 0: + self.report_resuming_byte(ctx.resume_len) + request.add_header('Range', 'bytes=%d-' % ctx.resume_len) + ctx.open_mode = 'ab' # Establish connection try: - data = self.ydl.urlopen(request) + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if resume_len > 0: - content_range = data.headers.get('Content-Range') + if ctx.resume_len > 0: + content_range = ctx.data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and resume_len == int(content_range_m.group(1)): - break + if content_range_m and ctx.resume_len == int(content_range_m.group(1)): + return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: # Unexpected HTTP error @@ -86,15 +99,15 @@ class HttpFD(FileDownloader): # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - data = self.ydl.urlopen(basic_request) - content_length = data.info()['Content-Length'] + ctx.data = self.ydl.urlopen(basic_request) + content_length = ctx.data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: raise else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < int(content_length) < resume_len + 100)): + (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -102,152 +115,184 @@ class HttpFD(FileDownloader): # I decided to implement a suggested change and consider the file # completely downloaded if the file size differs less than 100 bytes from # the one in the hard drive. - self.report_file_already_downloaded(filename) - self.try_rename(tmpfilename, filename) + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) self._hook_progress({ - 'filename': filename, + 'filename': ctx.filename, 'status': 'finished', - 'downloaded_bytes': resume_len, - 'total_bytes': resume_len, + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, }) - return True + raise SucceedDownload() else: # The length does not match, we start the download over self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break - except socket.error as e: - if e.errno != errno.ECONNRESET: + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: # Connection reset is no problem, just retry raise + raise RetryDownload(err) - # Retry - count += 1 - if count <= retries: - self.report_retry(count, retries) + def download(): + data_len = ctx.data.info().get('Content-length', None) - if count > retries: - self.report_error('giving up after %s retries' % retries) - return False + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE - data_len = data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) - return False - - byte_counter = 0 + resume_len - block_size = self.params.get('buffersize', 1024) - start = time.time() - - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - while True: - - # Download and write - data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - byte_counter += len(data_block) - - # exit loop when download is finished - if len(data_block) == 0: - break - - # Open destination file just in time - if stream is None: - try: - (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) - assert stream is not None - filename = self.undo_temp_name(tmpfilename) - self.report_destination(filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) return False - if self.params.get('xattr_set_filesize', False) and data_len is not None: + byte_counter = 0 + ctx.resume_len + block_size = self.params.get('buffersize', 1024) + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + if ctx.tmpfilename != '-': + ctx.stream.close() + ctx.stream = None + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: try: - write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except (XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False - try: - stream.write(data_block) - except (IOError, OSError) as err: + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - start, + }) + + if is_test and byte_counter == data_len: + break + + if ctx.stream is None: self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) + self.report_error('Did not get any data blocks') return False + if ctx.tmpfilename != '-': + ctx.stream.close() - # Apply rate limit - self.slow_down(start, now, byte_counter - resume_len) + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err - # end measuring of one loop run - now = time.time() - after = now + self.try_rename(ctx.tmpfilename, ctx.filename) - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - resume_len) - if data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) self._hook_progress({ - 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - start, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - start, }) - if is_test and byte_counter == data_len: - break + return True - if stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if tmpfilename != '-': - stream.close() + while count <= retries: + try: + establish_connection() + download() + return True + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except SucceedDownload: + return True - if data_len is not None and byte_counter != data_len: - raise ContentTooShortError(byte_counter, int(data_len)) - self.try_rename(tmpfilename, filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) - - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - start, - }) - - return True + self.report_error('giving up after %s retries' % retries) + return False From dd121cc1cab2f077cbf68ee432e83d4d094f0f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 03:12:56 +0700 Subject: [PATCH 116/238] [extractor/common] Extract height from res attribute of source tag for HTML5 videos (closes #14034) --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1804c4de0..b4af3f987 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2184,6 +2184,9 @@ class InfoExtractor(object): f = parse_content_type(source_attributes.get('type')) is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: + # res attribute is not standard but seen several times + # in the wild + f['height'] = int_or_none(source_attributes.get('res')) f.update(formats[0]) media_info['formats'].append(f) else: From 1ed4549942c34cce52b9c641cf9f532c38866149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 03:27:05 +0700 Subject: [PATCH 117/238] [extractor/common] Extract format id from label attribute of source tag for HTML5 videos (#14034) --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b4af3f987..74d30ec50 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2186,7 +2186,10 @@ class InfoExtractor(object): if is_plain_url: # res attribute is not standard but seen several times # in the wild - f['height'] = int_or_none(source_attributes.get('res')) + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: From ff17be3ac921910e5cab6f54a579f379dacae068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 03:27:20 +0700 Subject: [PATCH 118/238] [extractor/generic] Extract from LD-JSON last of all Previous sources may contain several formats, e.g. http://tamasha.com/v/PgGZ --- youtube_dl/extractor/generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 49b00b87e..c81efdc00 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2871,12 +2871,6 @@ class GenericIE(InfoExtractor): merged[k] = v return merged - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -2895,6 +2889,12 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False, base_url=url) return merge_dicts(info, info_dict) + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From 62c06c593d19c1314c84639128bb3354665399bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 04:24:41 +0700 Subject: [PATCH 119/238] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index a60bd5fc8..46d76ff3d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version + +Core ++ [extractor/common] Extract height and format id for HTML5 videos (#14034) +* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, + #8625, #9483) + * Simplify code and split into separate routines to facilitate maintaining + * Make retry mechanism work on errors during actual download not only + during connection establishment phase + * Retry on ECONNRESET and ETIMEDOUT during reading data from network + * Retry on content too short + * Show error description on retry + +Extractors +* [generic] Lower preference for extraction from LD-JSON +* [rai] Fix audio formats extraction (#14024) +* [youtube] Fix controversy videos extraction (#14027, #14029) +* [mixcloud] Fix extraction (#14015, #14020) + + version 2017.08.23 Core From f031b760650f57672deecacb3f1991bd6fcd778e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 04:28:04 +0700 Subject: [PATCH 120/238] release 2017.08.27 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 3e1ff1536..777932f93 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.23** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.27*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.27** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.23 +[debug] youtube-dl version 2017.08.27 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 46d76ff3d..a997759b0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.08.27 Core + [extractor/common] Extract height and format id for HTML5 videos (#14034) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 94d35a66a..6a24b70d7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.23' +__version__ = '2017.08.27' From 1c9c8de29e83bbfc38027dabcd9642f1c41a64b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 05:59:08 +0700 Subject: [PATCH 121/238] [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (closes #14037) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3d0f5a5c9..5a6b735a0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1621,7 +1621,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) if view_count is None: - view_count = extract_view_count(get_video_info) + view_count = extract_view_count(video_info) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: From cc0412ef915047233391c978fa1f8d4d1479ddfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 06:06:49 +0700 Subject: [PATCH 122/238] [ChangeLog] Actualize --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index a997759b0..c38a83670 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version + +Extractors + +* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037) + + version 2017.08.27 Core From 2cfa7cbdd09353c8f8b135ba1c0b3a325bb03a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 06:09:29 +0700 Subject: [PATCH 123/238] release 2017.08.27.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 777932f93..7959d910b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.27*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.27** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.27.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.27.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.27 +[debug] youtube-dl version 2017.08.27.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index c38a83670..ef9ac4660 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.08.27.1 Extractors diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6a24b70d7..7504ef0f0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.27' +__version__ = '2017.08.27.1' From 3902cdd0e3309364809407b895e4060199b89d11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 Aug 2017 22:36:57 +0700 Subject: [PATCH 124/238] [pornhd] Fix extraction (closes #14005) --- youtube_dl/extractor/pornhd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 36761788d..b52879c7a 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor): r'(.+?) - .*?[Pp]ornHD.*?'], webpage, 'title') sources = self._parse_json(js_to_json(self._search_regex( - r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]", + r"(?s)sources'?\s*[:=]\s*(\{.+?\})", webpage, 'sources', default='{}')), video_id) if not sources: @@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'(\d+) views\s*<', webpage, 'view count', fatal=False)) thumbnail = self._search_regex( - r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) + r"poster'?\s*:\s*([\"'])(?P(?:(?!\1).)+)\1", webpage, + 'thumbnail', fatal=False, group='url') return { 'id': video_id, From fea82c1780cc751267fd2b9b4145996bfc0c1994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 28 Aug 2017 00:39:22 +0700 Subject: [PATCH 125/238] [googledrive] Add support for source format (closes #14046) --- youtube_dl/extractor/googledrive.py | 121 +++++++++++++++++++--------- 1 file changed, 84 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 97ff28219..37d373901 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, int_or_none, lowercase_escape, @@ -15,17 +16,26 @@ class GoogleDriveIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P[a-zA-Z0-9_-]{28,})' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': 'd109872761f7e7ecf353fa108c0dbe1e', + 'md5': '5c602afbbf2c1db91831f5d82f678554', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', 'duration': 45, } + }, { + # video can't be watched anonymously due to view count limit reached, + # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046) + 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', + 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', + 'info_dict': { + 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', + 'ext': 'mp4', + 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', + } }, { # video id is longer than 28 characters 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', - 'md5': 'c230c67252874fddd8170e3fd1a45886', 'info_dict': { 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', 'ext': 'mp4', @@ -147,47 +157,84 @@ class GoogleDriveIE(InfoExtractor): webpage = self._download_webpage( 'http://docs.google.com/file/d/%s' % video_id, video_id) - reason = self._search_regex( - r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) - if reason: - raise ExtractorError(reason) - - title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + title = self._search_regex( + r'"title"\s*,\s*"([^"]+)', webpage, 'title', + default=None) or self._og_search_title(webpage) duration = int_or_none(self._search_regex( r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) - fmt_stream_map = self._search_regex( - r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, - 'fmt stream map').split(',') - fmt_list = self._search_regex( - r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') - - resolutions = {} - for fmt in fmt_list: - mobj = re.search( - r'^(?P\d+)/(?P\d+)[xX](?P\d+)', fmt) - if mobj: - resolutions[mobj.group('format_id')] = ( - int(mobj.group('width')), int(mobj.group('height'))) formats = [] - for fmt_stream in fmt_stream_map: - fmt_stream_split = fmt_stream.split('|') - if len(fmt_stream_split) < 2: - continue - format_id, format_url = fmt_stream_split[:2] - f = { - 'url': lowercase_escape(format_url), - 'format_id': format_id, - 'ext': self._FORMATS_EXT[format_id], - } - resolution = resolutions.get(format_id) - if resolution: - f.update({ - 'width': resolution[0], - 'height': resolution[1], + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, + 'fmt stream map', default='').split(',') + fmt_list = self._search_regex( + r'"fmt_list"\s*,\s*"([^"]+)', webpage, + 'fmt_list', default='').split(',') + if fmt_stream_map and fmt_list: + resolutions = {} + for fmt in fmt_list: + mobj = re.search( + r'^(?P\d+)/(?P\d+)[xX](?P\d+)', fmt) + if mobj: + resolutions[mobj.group('format_id')] = ( + int(mobj.group('width')), int(mobj.group('height'))) + + for fmt_stream in fmt_stream_map: + fmt_stream_split = fmt_stream.split('|') + if len(fmt_stream_split) < 2: + continue + format_id, format_url = fmt_stream_split[:2] + f = { + 'url': lowercase_escape(format_url), + 'format_id': format_id, + 'ext': self._FORMATS_EXT[format_id], + } + resolution = resolutions.get(format_id) + if resolution: + f.update({ + 'width': resolution[0], + 'height': resolution[1], + }) + formats.append(f) + + source_url = update_url_query( + 'https://drive.google.com/uc', { + 'id': video_id, + 'export': 'download', + }) + urlh = self._request_webpage( + source_url, video_id, note='Requesting source file', + errnote='Unable to request source file', fatal=False) + if urlh: + def add_source_format(src_url): + formats.append({ + 'url': src_url, + 'ext': determine_ext(title, 'mp4').lower(), + 'format_id': 'source', + 'quality': 1, }) - formats.append(f) + if urlh.headers.get('Content-Disposition'): + add_source_format(source_url) + else: + confirmation_webpage = self._webpage_read_content( + urlh, url, video_id, note='Downloading confirmation page', + errnote='Unable to confirm download', fatal=False) + if confirmation_webpage: + confirm = self._search_regex( + r'confirm=([^&"\']+)', confirmation_webpage, + 'confirmation code', fatal=False) + if confirm: + add_source_format(update_url_query(source_url, { + 'confirm': confirm, + })) + + if not formats: + reason = self._search_regex( + r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason, expected=True) + self._sort_formats(formats) hl = self._search_regex( From 1b41da488df64bd41463a79691330e555f79ac2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 28 Aug 2017 00:50:41 +0700 Subject: [PATCH 126/238] [googledrive] Extend _VALID_URL (closes #9785) --- youtube_dl/extractor/googledrive.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 37d373901..3bf462d63 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -13,7 +13,18 @@ from ..utils import ( class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P[a-zA-Z0-9_-]{28,})' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:docs|drive)\.google\.com/ + (?: + (?:uc|open)\?.*?id=| + file/d/ + )| + video\.google\.com/get_player\?.*?docid= + ) + (?P[a-zA-Z0-9_-]{28,}) + ''' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'md5': '5c602afbbf2c1db91831f5d82f678554', @@ -42,7 +53,13 @@ class GoogleDriveIE(InfoExtractor): 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', 'duration': 189, }, - 'only_matching': True + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, + }, { + 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', + 'only_matching': True, }] _FORMATS_EXT = { '5': 'flv', From c75c384fb6d64f8d44c10d798cb64c8c00c61175 Mon Sep 17 00:00:00 2001 From: Ryan Schmidt Date: Sun, 27 Aug 2017 18:07:09 -0500 Subject: [PATCH 127/238] Fix build failures with old cp and zip --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 41e3a683a..c74eea792 100644 --- a/Makefile +++ b/Makefile @@ -49,11 +49,11 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py mkdir -p zip for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ mkdir -p zip/$$d ;\ - cp -a $$d/*.py zip/$$d/ ;\ + cp -pPR $$d/*.py zip/$$d/ ;\ done touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py mv zip/youtube_dl/__main__.py zip/ - cd zip ; zip --quiet ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py + cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py rm -rf zip echo '#!$(PYTHON)' > youtube-dl cat youtube-dl.zip >> youtube-dl From 53647dfd0ae5f3d369e01f04acd5eff9713cc91d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Aug 2017 05:27:56 +0700 Subject: [PATCH 128/238] [bbccouk] Add support for w-prefixed ids (closes #14056) --- youtube_dl/extractor/bbc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 911ae6780..8b20c03d6 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -29,7 +29,7 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pb][\da-z]{7}' + _ID_REGEX = r'[pbw][\da-z]{7}' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -233,6 +233,9 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' From 5b4bfbfc3bba2174234834a08a97e261e492a2f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Aug 2017 23:50:33 +0700 Subject: [PATCH 129/238] [charlierose] Add support for episodes (closes #14062) --- youtube_dl/extractor/charlierose.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 2d517f231..42c9af263 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -5,7 +5,7 @@ from ..utils import remove_end class CharlieRoseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P\d+)' _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', @@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor): }, { 'url': 'https://charlierose.com/videos/27996', 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' From 7998520933db70625ff3859c464fa4bbf2ff7fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Aug 2017 00:47:58 +0700 Subject: [PATCH 130/238] [youtube] Fix upload date extraction (closes #14065) --- youtube_dl/extractor/youtube.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5a6b735a0..563cf6274 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1665,10 +1665,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)'], + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( From 8d81f3e36d3caaeabcdff99d3340d4075d30741e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Sep 2017 00:57:14 +0700 Subject: [PATCH 131/238] [youtube] Force old layout for each webpage (closes #14083) --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 563cf6274..953e38227 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _download_webpage(self, *args, **kwargs): + kwargs.setdefault('query', {})['disable_polymer'] = 'true' + return super(YoutubeBaseInfoExtractor, self)._download_webpage( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -2052,7 +2058,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ From 8681ed7fc80a6b4b99e3a57adf5ec8d4177a601c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 01:04:22 +0700 Subject: [PATCH 132/238] [ChangeLog] Actualize --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index ef9ac4660..d89fb3d18 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version <unreleased> + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + version 2017.08.27.1 Extractors From a2022b0c406286aa3d5101f1c85e7d11453e89d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 01:08:32 +0700 Subject: [PATCH 133/238] release 2017.09.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7959d910b..bd9e21983 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.27.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.27.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.27.1 +[debug] youtube-dl version 2017.09.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d89fb3d18..c439c8ef9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.02 Extractors * [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7504ef0f0..60ed35de9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.27.1' +__version__ = '2017.09.02' From a3431e12249530aa7a6962e54bcdbfce190c4c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 15:33:54 +0700 Subject: [PATCH 134/238] [radiocanada] Skip unsupported platforms (closes #14100) --- youtube_dl/extractor/radiocanada.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 3b40002a8..6bbc2781c 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -59,6 +59,7 @@ class RadioCanadaIE(InfoExtractor): device_types.append('android') formats = [] + error = None # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file for device_type in device_types: @@ -84,8 +85,8 @@ class RadioCanadaIE(InfoExtractor): if not v_url: continue if v_url == 'null': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, xpath_text(v_data, 'message')), expected=True) + error = xpath_text(v_data, 'message') + continue ext = determine_ext(v_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -129,6 +130,9 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) + if not formats and error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) subtitles = {} From 64f0e30b93db804323a6db382d4ccdf1dac4e38b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 15:44:49 +0700 Subject: [PATCH 135/238] [viidea] Capture and output lecture error message (#14099) --- youtube_dl/extractor/viidea.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 4adcd1830..a0abbae60 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,12 +4,14 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_HTTPError, compat_str, + compat_urlparse, ) from ..utils import ( - parse_duration, + ExtractorError, js_to_json, + parse_duration, parse_iso8601, ) @@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor): base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json( - '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), - lecture_id)['lecture'][0] + try: + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json( + e.cause.read().decode('utf-8'), lecture_id) + raise ExtractorError(msg['detail'], expected=True) + raise lecture_info = { 'id': lecture_id, From 503115540d8f135dc944ae48e40ba78f36238867 Mon Sep 17 00:00:00 2001 From: dubber0 <rexa.mose@gmail.com> Date: Sat, 22 Jul 2017 21:32:51 +0200 Subject: [PATCH 136/238] [aliexpress:live] Add extractor --- youtube_dl/extractor/aliexpress.py | 40 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 41 insertions(+) create mode 100644 youtube_dl/extractor/aliexpress.py diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py new file mode 100644 index 000000000..3997213f8 --- /dev/null +++ b/youtube_dl/extractor/aliexpress.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +import re + +from .common import InfoExtractor +from ..utils import try_get, float_or_none +from ..compat import compat_str + + +class AliExpressLiveIE(InfoExtractor): + + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>[0-9]{16})' + _TEST = { + 'url': 'https://live.aliexpress.com/live/2800002704436634', + 'md5': '7ac2bc46afdd18f0b45a0a340fc47ffe', + 'info_dict': { + 'id': '2800002704436634', + 'ext': 'm3u8', + 'title': 'CASIMA7.22', + 'uploader': 'CASIMA Official Store', + 'upload_date': '20170714', + 'timestamp': 1500027138, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + page = self._download_webpage(url, video_id) + run_params_json = self._search_regex(r'runParams = (.+)[\s+]var myCtl', page, 'runParams', flags=re.DOTALL) + run_params = self._parse_json(run_params_json, video_id) + + return { + 'id': video_id, + 'title': run_params['title'], + 'url': run_params['replyStreamUrl'], + 'uploader': try_get(run_params, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(try_get(run_params, lambda x: x['followBar']['createTime']) / 1000), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 17048fd6e..d335f9fff 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -45,6 +45,7 @@ from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( From 23b2df82c70a832e485aaf52befa26e27a904995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 16:04:36 +0700 Subject: [PATCH 137/238] [aliexpress:live] Fix issues (closes #13698, closes #13707) --- youtube_dl/extractor/aliexpress.py | 47 +++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py index 3997213f8..6f241e683 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/youtube_dl/extractor/aliexpress.py @@ -1,40 +1,53 @@ # coding: utf-8 from __future__ import unicode_literals - -import re - from .common import InfoExtractor -from ..utils import try_get, float_or_none from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, +) class AliExpressLiveIE(InfoExtractor): - - _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>[0-9]{16})' + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)' _TEST = { 'url': 'https://live.aliexpress.com/live/2800002704436634', - 'md5': '7ac2bc46afdd18f0b45a0a340fc47ffe', + 'md5': 'e729e25d47c5e557f2630eaf99b740a5', 'info_dict': { 'id': '2800002704436634', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'CASIMA7.22', + 'thumbnail': r're:http://.*\.jpg', 'uploader': 'CASIMA Official Store', - 'upload_date': '20170714', - 'timestamp': 1500027138, + 'timestamp': 1500717600, + 'upload_date': '20170722', }, } def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) - run_params_json = self._search_regex(r'runParams = (.+)[\s+]var myCtl', page, 'runParams', flags=re.DOTALL) - run_params = self._parse_json(run_params_json, video_id) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', + webpage, 'runParams'), + video_id) + + title = data['title'] + + formats = self._extract_m3u8_formats( + data['replyStreamUrl'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') return { 'id': video_id, - 'title': run_params['title'], - 'url': run_params['replyStreamUrl'], - 'uploader': try_get(run_params, lambda x: x['followBar']['name'], compat_str), - 'timestamp': float_or_none(try_get(run_params, lambda x: x['followBar']['createTime']) / 1000), + 'title': title, + 'thumbnail': data.get('coverUrl'), + 'uploader': try_get( + data, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), + 'formats': formats, } From 73602bcd0c254b735cc93ce5ffeca9e98228190e Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi <ishitatsuyuki@gmail.com> Date: Fri, 1 Sep 2017 17:08:24 +0900 Subject: [PATCH 138/238] [soundcloud] Fix download URL with private tracks --- youtube_dl/extractor/soundcloud.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2e52e092b..23dcac803 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,8 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re from .common import ( InfoExtractor, @@ -17,7 +17,7 @@ from ..utils import ( ExtractorError, int_or_none, unified_strdate, -) + update_url_query) class SoundcloudIE(InfoExtractor): @@ -160,11 +160,13 @@ class SoundcloudIE(InfoExtractor): 'license': info.get('license'), } formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token is not None: + query['secret_token'] = secret_token if info.get('downloadable', False): # We can build a direct link to the song - format_url = ( - 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( - track_id, self._CLIENT_ID)) + format_url = update_url_query( + 'https://api.soundcloud.com/tracks/{0}/download'.format(track_id), query) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), @@ -176,10 +178,7 @@ class SoundcloudIE(InfoExtractor): # We have to retrieve the url format_dict = self._download_json( 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query={ - 'client_id': self._CLIENT_ID, - 'secret_token': secret_token, - }) + track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): abr = int_or_none(self._search_regex( @@ -216,7 +215,7 @@ class SoundcloudIE(InfoExtractor): # cannot be always used, sometimes it can give an HTTP 404 error formats.append({ 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'url': update_url_query(info['stream_url'], query), 'ext': ext, }) From d7c7100e3d920512a11bf7c6fee21e26da7ffa73 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Sun, 3 Sep 2017 16:18:24 +0700 Subject: [PATCH 139/238] [soundcloud] Simplify and add test (closes #14093) --- youtube_dl/extractor/soundcloud.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 23dcac803..1c6799d57 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -17,7 +17,8 @@ from ..utils import ( ExtractorError, int_or_none, unified_strdate, - update_url_query) + update_url_query, +) class SoundcloudIE(InfoExtractor): @@ -120,6 +121,21 @@ class SoundcloudIE(InfoExtractor): 'license': 'cc-by-sa', }, }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'upload_date': '20170831', + 'duration': 7449, + 'license': 'all-rights-reserved', + }, + }, ] _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg' @@ -166,7 +182,7 @@ class SoundcloudIE(InfoExtractor): if info.get('downloadable', False): # We can build a direct link to the song format_url = update_url_query( - 'https://api.soundcloud.com/tracks/{0}/download'.format(track_id), query) + 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), From 0cbb841ba94c8d813ff81e817154c5491a796f20 Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Thu, 31 Aug 2017 12:56:37 +0200 Subject: [PATCH 140/238] [bpb] Fix extraction (closes #14043) --- youtube_dl/extractor/bpb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 9661ade4f..14bc0f77d 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,13 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - quality = video_info['quality'] video_url = video_info['src'] + quality = 'high' if re.search(r'_high\.', video_url) else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, From c1c1585b316995ca47b59e8dc1e3b463beb1c54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 16:38:43 +0700 Subject: [PATCH 141/238] [bpb] Improve (closes #14086) --- youtube_dl/extractor/bpb.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 14bc0f77d..07833532e 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,18 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src\s*:\s*'https://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: - video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - video_url = video_info['src'] - quality = 'high' if re.search(r'_high\.', video_url) else 'low' + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, From 0b4a8eb3ac823c26b037eb368c114ce6d976c5c3 Mon Sep 17 00:00:00 2001 From: theychx <boomingzooming@gmail.com> Date: Mon, 28 Aug 2017 21:35:57 +0200 Subject: [PATCH 142/238] [vidme:user] Relax _VALID_URLs --- youtube_dl/extractor/vidme.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index a7971d72e..39b65ed2f 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -263,29 +263,43 @@ class VidmeListBaseIE(InfoExtractor): class VidmeUserIE(VidmeListBaseIE): IE_NAME = 'vidme:user' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)' _API_ITEM = 'list' _TITLE = 'Videos' - _TEST = { - 'url': 'https://vid.me/EFARCHIVE', + _TESTS = [{ + 'url': 'https://vid.me/MasakoX', 'info_dict': { - 'id': '3834632', - 'title': 'EFARCHIVE - %s' % _TITLE, + 'id': '16112341', + 'title': 'MasakoX - %s' % _TITLE, }, - 'playlist_mincount': 238, - } + 'playlist_mincount': 191, + }, { + 'url': 'https://vid.me/unsQuare_netWork', + 'info_dict': { + 'id': '16148757', + 'title': 'unsQuare_netWork - %s' % _TITLE, + }, + 'playlist_mincount': 73, + }] class VidmeUserLikesIE(VidmeListBaseIE): IE_NAME = 'vidme:user:likes' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes' _API_ITEM = 'likes' _TITLE = 'Likes' - _TEST = { + _TESTS = [{ 'url': 'https://vid.me/ErinAlexis/likes', 'info_dict': { 'id': '6483530', 'title': 'ErinAlexis - %s' % _TITLE, }, 'playlist_mincount': 415, - } + }, { + 'url': 'https://vid.me/Kaleidoscope-Ish/likes', + 'info_dict': { + 'id': '16908594', + 'title': 'Kaleidoscope-Ish - %s' % _TITLE, + }, + 'playlist_mincount': 43, + }] From bc35f075370ed1e67fe71c544e6243a2fc4fa430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 17:02:11 +0700 Subject: [PATCH 143/238] [vidme:user] Make tests only matching (closes #14054) --- youtube_dl/extractor/vidme.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 39b65ed2f..59adb2377 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -275,11 +275,7 @@ class VidmeUserIE(VidmeListBaseIE): 'playlist_mincount': 191, }, { 'url': 'https://vid.me/unsQuare_netWork', - 'info_dict': { - 'id': '16148757', - 'title': 'unsQuare_netWork - %s' % _TITLE, - }, - 'playlist_mincount': 73, + 'only_matching': True, }] @@ -297,9 +293,5 @@ class VidmeUserLikesIE(VidmeListBaseIE): 'playlist_mincount': 415, }, { 'url': 'https://vid.me/Kaleidoscope-Ish/likes', - 'info_dict': { - 'id': '16908594', - 'title': 'Kaleidoscope-Ish - %s' % _TITLE, - }, - 'playlist_mincount': 43, + 'only_matching': True, }] From e9b865267aaa90e3b9e1b0468d20a4df31e13393 Mon Sep 17 00:00:00 2001 From: John D <jdong1992@gmail.com> Date: Wed, 30 Aug 2017 00:14:43 -0700 Subject: [PATCH 144/238] [manyvids] Add support for preview videos (closes #14053) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/manyvids.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/manyvids.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d335f9fff..46a11f3ef 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -564,6 +564,7 @@ from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) +from .manyvids import ManyVidsIE from .matchtv import MatchTVIE from .mdr import MDRIE from .mediaset import MediasetIE diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py new file mode 100644 index 000000000..ea739ce3f --- /dev/null +++ b/youtube_dl/extractor/manyvids.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class ManyVidsIE(InfoExtractor): + _VALID_URL = r'https?://www.manyvids\.com/Video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', + 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'info_dict': { + 'id': '133957', + 'ext': 'mp4', + 'title': 'everthing about me', + + } + } + + def _real_extract(self, url): + formats = [] + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = compat_urllib_parse_unquote(self._search_regex( + r'data-video-filepath=\"(.+?)\"', webpage, 'video URL', default='')) + + title = self._html_search_regex(r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + formats.append({ + 'url': video_url + }) + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } From efc57145c10bdf22da9d8571c35ccd0404e3b7c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 17:30:02 +0700 Subject: [PATCH 145/238] [manyvids] Improve (closes #14059) --- youtube_dl/extractor/manyvids.py | 40 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index ea739ce3f..b94b3c2ab 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -2,35 +2,47 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..utils import int_or_none class ManyVidsIE(InfoExtractor): - _VALID_URL = r'https?://www.manyvids\.com/Video/(?P<id>[0-9]+)' + _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)' _TEST = { 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', 'info_dict': { 'id': '133957', 'ext': 'mp4', - 'title': 'everthing about me', - - } + 'title': 'everthing about me (Preview)', + 'view_count': int, + 'like_count': int, + }, } def _real_extract(self, url): - formats = [] video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = compat_urllib_parse_unquote(self._search_regex( - r'data-video-filepath=\"(.+?)\"', webpage, 'video URL', default='')) - title = self._html_search_regex(r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') - formats.append({ - 'url': video_url - }) + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url') + + title = '%s (Preview)' % self._html_search_regex( + r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + + like_count = int_or_none(self._search_regex( + r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) + view_count = int_or_none(self._html_search_regex( + r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, + 'view count', default=None)) + return { 'id': video_id, 'title': title, - 'formats': formats, + 'view_count': view_count, + 'like_count': like_count, + 'formats': [{ + 'url': video_url, + }], } From 6348671c4a4e9f45cebc107ff4c148ef4970bb39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Sep 2017 23:08:07 +0700 Subject: [PATCH 146/238] [arte] Relax unavailability check (closes #14112) --- youtube_dl/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 02613cf5d..5cde90c5b 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -82,7 +82,7 @@ class ArteTVBaseIE(InfoExtractor): vsr = player_info['VSR'] - if not vsr and not player_info.get('VRU'): + if not vsr: raise ExtractorError( 'Video %s is not available' % player_info.get('VID') or video_id, expected=True) From 880fa66f4ffa9afcfce91b5ce39f05909050da67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Sep 2017 22:45:07 +0700 Subject: [PATCH 147/238] [redtube] Fix formats extraction (closes #14122) --- youtube_dl/extractor/redtube.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index c367a6ae7..f70a75256 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -62,7 +63,23 @@ class RedTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) - else: + medias = self._parse_json( + self._search_regex( + r'mediaDefinition\s*:\s*(\[.+?\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = media.get('videoUrl') + if not format_url or not isinstance(format_url, compat_str): + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') formats.append({'url': video_url}) @@ -73,7 +90,7 @@ class RedTubeIE(InfoExtractor): r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', webpage, 'upload date', fatal=False)) duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', webpage, 'view count', fatal=False)) From c5c9bf0c120d2c481124a0c3913b981cf061fb95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Sep 2017 23:31:34 +0700 Subject: [PATCH 148/238] [YoutubeDL] Ensure dir existence for each requested format (closes #14116) --- youtube_dl/YoutubeDL.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5f4c93ea3..4f208f1e1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1710,12 +1710,17 @@ class YoutubeDL(object): if filename is None: return - try: - dn = os.path.dirname(sanitize_path(encodeFilename(filename))) - if dn and not os.path.exists(dn): - os.makedirs(dn) - except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_compat_str(err)) + def ensure_dir_exists(path): + try: + dn = os.path.dirname(path) + if dn and not os.path.exists(dn): + os.makedirs(dn) + return True + except (OSError, IOError) as err: + self.report_error('unable to create directory ' + error_to_compat_str(err)) + return False + + if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): return if self.params.get('writedescription', False): @@ -1853,8 +1858,11 @@ class YoutubeDL(object): for f in requested_formats: new_info = dict(info_dict) new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + fname = prepend_extension( + self.prepare_filename(new_info), + 'f%s' % f['format_id'], new_info['ext']) + if not ensure_dir_exists(fname): + return downloaded.append(fname) partial_success = dl(fname, new_info) success = success and partial_success From 66c9fa36c10860b380806b9de48f38d628289e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 6 Sep 2017 00:48:37 +0700 Subject: [PATCH 149/238] [youtube] Separate methods for embeds extraction --- youtube_dl/extractor/generic.py | 33 ++++------------------------- youtube_dl/extractor/youtube.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c81efdc00..b83c18380 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2243,36 +2243,11 @@ class GenericIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') - # Look for embedded YouTube player - matches = re.findall(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) - \1''', webpage) - if matches: + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: return self.playlist_from_matches( - matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - - # Look for lazyYT YouTube embed - matches = re.findall( - r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - - # Look for Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) matches = DailymotionIE._extract_urls(webpage) if matches: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 953e38227..ad2e933ee 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1374,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/.+?) + \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) From 5113b6912467619bd463c5ebefe759d07078bea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 6 Sep 2017 00:50:25 +0700 Subject: [PATCH 150/238] [abcnews,chilloutsoze,cracked,vice,vk] Use dedicated YouTube embeds extraction routines --- youtube_dl/extractor/abcnews.py | 7 +++---- youtube_dl/extractor/chilloutzone.py | 9 ++++----- youtube_dl/extractor/cracked.py | 7 +++---- youtube_dl/extractor/vice.py | 7 +++---- youtube_dl/extractor/vk.py | 7 +++---- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 74d54560c..f770fe901 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -7,6 +7,7 @@ import time from .amp import AMPIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_urlparse @@ -108,9 +109,7 @@ class AbcNewsIE(InfoExtractor): r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') full_video_url = compat_urlparse.urljoin(url, video_url) - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(webpage) timestamp = None date_str = self._html_search_regex( @@ -140,7 +139,7 @@ class AbcNewsIE(InfoExtractor): } if youtube_url: - entries = [entry, self.url_result(youtube_url, 'Youtube')] + entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] return self.playlist_result(entries) return entry diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 0206d96db..d4769da75 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -5,6 +5,7 @@ import base64 import json from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( clean_html, ExtractorError @@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor): # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) if native_platform is None: - youtube_url = self._html_search_regex( - r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - webpage, 'fallback video URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, ie='Youtube') + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or # the own CDN diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 94d03ce2a..f77a68ece 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( parse_iso8601, str_to_int, @@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - youtube_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'youtube url', default=None) + youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) video_url = self._html_search_regex( [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 54e207b39..b8b8bf979 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -7,6 +7,7 @@ import hashlib import json from .adobepass import AdobePassIE +from .youtube import YoutubeIE from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -261,11 +262,9 @@ class ViceArticleIE(InfoExtractor): if embed_code: return _url_res('ooyala:%s' % embed_code, 'Ooyala') - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="(.*youtube\.com/.*)"', - body, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(body) if youtube_url: - return _url_res(youtube_url, 'Youtube') + return _url_res(youtube_url, YoutubeIE.ie_key()) video_url = self._html_search_regex( r'data-video-url="([^"]+)"', diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index dc2719cf9..105e172d5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -25,6 +25,7 @@ from ..utils import ( from .dailymotion import DailymotionIE from .pladform import PladformIE from .vimeo import VimeoIE +from .youtube import YoutubeIE class VKBaseIE(InfoExtractor): @@ -345,11 +346,9 @@ class VKIE(VKBaseIE): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - youtube_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - info_page, 'youtube iframe', default=None) + youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: From 931edb2ada89db2bf3596ce1ad5c4d808914c7ab Mon Sep 17 00:00:00 2001 From: Olivier Bilodeau <olivier@bottomlesspit.org> Date: Fri, 8 Sep 2017 10:53:24 -0400 Subject: [PATCH 151/238] [radiocanada] Add fallback for title extraction --- youtube_dl/extractor/radiocanada.py | 45 ++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 6bbc2781c..b952e59b4 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -20,20 +20,37 @@ from ..utils import ( class RadioCanadaIE(InfoExtractor): IE_NAME = 'radiocanada' _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' - _TEST = { - 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', - 'info_dict': { - 'id': '7184272', - 'ext': 'mp4', - 'title': 'Le parcours du tireur capté sur vidéo', - 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', - 'upload_date': '20141023', + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + ] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -145,7 +162,7 @@ class RadioCanadaIE(InfoExtractor): return { 'id': video_id, - 'title': get_meta('Title'), + 'title': get_meta('Title') or get_meta('AV-nomEmission'), 'description': get_meta('Description') or get_meta('ShortDescription'), 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), 'duration': int_or_none(get_meta('length')), From 51aee72d16eb844377a44c12e50dbb95cd4ced27 Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Fri, 8 Sep 2017 15:13:17 +0000 Subject: [PATCH 152/238] [README.md] Clarify how to run extractor specific test cases --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f5d00df3..28ee63f40 100644 --- a/README.md +++ b/README.md @@ -936,6 +936,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -1003,7 +1005,7 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: From debed8d759e74507371758d2344ce5afe5e237c2 Mon Sep 17 00:00:00 2001 From: luceatnobis <wehrmeyer.martin@web.de> Date: Tue, 4 Jul 2017 11:26:02 +0200 Subject: [PATCH 153/238] [rutube:playlist] Add extractor (closes #13534) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rutube.py | 84 ++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 46a11f3ef..aefadc56f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -899,6 +899,7 @@ from .rutube import ( RutubeEmbedIE, RutubeMovieIE, RutubePersonIE, + RutubePlaylistIE, ) from .rutv import RUTVIE from .ruutu import RuutuIE diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 889fa7628..a6b17c0ef 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -7,10 +7,14 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, unified_strdate, + try_get, + int_or_none, ) @@ -42,8 +46,24 @@ class RutubeIE(InfoExtractor): }, { 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + parts = compat_urllib_parse_urlparse(url) + params = compat_parse_qs(parts.query) + + # see if URL without parameters is OK + res = super(RutubeIE, cls).suitable(url) + + if params: # we only allow pl_id parameter in the url + res = res and 'pl_id' in params and len(params) == 1 + + return res + @staticmethod def _extract_urls(webpage): return [mobj.group('url') for mobj in re.finditer( @@ -193,3 +213,67 @@ class RutubePersonIE(RutubeChannelIE): }] _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(InfoExtractor): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _TESTS = [{ + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'info_dict': { + 'id': '4252', + }, + 'playlist_count': 25, + }] + + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?(?:.+)?pl_id=(?P<id>\d+)' + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/source/%s/?page=%s' + + @staticmethod + def suitable(url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + return params.get('pl_id') and int_or_none(params['pl_id'][0]) \ + and params.get('pl_type') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + return self._extract_playlist(playlist_id) + + def _extract_playlist(self, playlist_id): + entries = [] + for pagenum in itertools.count(1): + page_url = self._PAGE_TEMPLATE % (playlist_id, pagenum) + + # download_json will sent an accept: application/xml header + page = self._download_json(page_url, playlist_id, + "Downloading metadata for page %s" % pagenum, + headers={'Accept': 'application/json'}) + + if not page['results']: + break + + results = page['results'] + for result in results: + entry = self.url_result(result.get('video_url'), 'Rutube') + category = try_get(result, lambda x: x['category']['name']) + entry.update({ + 'id': result.get('id'), + 'uploader': try_get(result, lambda x: x['author']['name']), + 'uploader_id': try_get(result, lambda x: x['author']['id']), + 'upload_date': unified_strdate(result.get('created_ts')), + 'title': result.get('title'), + 'description': result.get('description'), + 'thumbnail': result.get('thumbnail_url'), + 'duration': int_or_none(result.get('duration')), + 'category': [category] if category else None, + 'age_limit': 18 if result.get('is_adult') else 0, + 'view_count': int_or_none(result.get('hits')), + 'is_live': result.get('is_livestream'), + 'webpage_url': result.get('video_url'), + }) + entries.append(entry) + + if page['has_next'] is False: + break + + return self.playlist_result(entries, playlist_id, page['name']) From 48b813748d91acc7e9efc15075079a03faea18ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 18:39:13 +0700 Subject: [PATCH 154/238] [rutube] Rework and generalize playlist extractors (closes #13565) --- youtube_dl/extractor/rutube.py | 216 ++++++++++++++++----------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index a6b17c0ef..5a184879f 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -12,34 +12,60 @@ from ..compat import ( ) from ..utils import ( determine_ext, - unified_strdate, + unified_timestamp, try_get, int_or_none, ) -class RutubeIE(InfoExtractor): +class RutubeBaseIE(InfoExtractor): + def _extract_video(self, video, video_id=None, require_title=True): + title = video['title'] if require_title else video.get('title') + + age_limit = video.get('is_adult') + if age_limit is not None: + age_limit = 18 if age_limit is True else 0 + + uploader_id = try_get(video, lambda x: x['author']['id']) + category = try_get(video, lambda x: x['category']['name']) + + return { + 'id': video.get('id') or video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video.get('duration')), + 'uploader': try_get(video, lambda x: x['author']['name']), + 'uploader_id': compat_str(uploader_id) if uploader_id else None, + 'timestamp': unified_timestamp(video.get('created_ts')), + 'category': [category] if category else None, + 'age_limit': age_limit, + 'view_count': int_or_none(video.get('hits')), + 'comment_count': int_or_none(video.get('comments_count')), + 'is_live': video.get('is_livestream'), + } + + +class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '79938ade01294ef7e27574890d0d3769', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', 'duration': 80, 'uploader': 'NTDRussian', 'uploader_id': '29790', + 'timestamp': 1381943602, 'upload_date': '20131016', 'age_limit': 0, }, - 'params': { - # It requires ffmpeg (m3u8 download) - 'skip_download': True, - }, }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, @@ -49,20 +75,14 @@ class RutubeIE(InfoExtractor): }, { 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, }] @classmethod def suitable(cls, url): - parts = compat_urllib_parse_urlparse(url) - params = compat_parse_qs(parts.query) - - # see if URL without parameters is OK - res = super(RutubeIE, cls).suitable(url) - - if params: # we only allow pl_id parameter in the url - res = res and 'pl_id' in params and len(params) == 1 - - return res + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) @staticmethod def _extract_urls(webpage): @@ -72,12 +92,12 @@ class RutubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video = self._download_json( 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') - # Some videos don't have the author field - author = video.get('author') or {} + info = self._extract_video(video, video_id) options = self._download_json( 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, @@ -99,19 +119,8 @@ class RutubeIE(InfoExtractor): }) self._sort_formats(formats) - return { - 'id': video['id'], - 'title': video['title'], - 'description': video['description'], - 'duration': video['duration'], - 'view_count': video['hits'], - 'formats': formats, - 'thumbnail': video['thumbnail_url'], - 'uploader': author.get('name'), - 'uploader_id': compat_str(author['id']) if author else None, - 'upload_date': unified_strdate(video['created_ts']), - 'age_limit': 18 if video['is_adult'] else 0, - } + info['formats'] = formats + return info class RutubeEmbedIE(InfoExtractor): @@ -123,7 +132,8 @@ class RutubeEmbedIE(InfoExtractor): 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'mp4', + 'ext': 'flv', + 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', @@ -131,7 +141,7 @@ class RutubeEmbedIE(InfoExtractor): 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', }, 'params': { - 'skip_download': 'Requires ffmpeg', + 'skip_download': True, }, }, { 'url': 'http://rutube.ru/play/embed/8083783', @@ -145,10 +155,51 @@ class RutubeEmbedIE(InfoExtractor): canonical_url = self._html_search_regex( r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, 'Canonical URL') - return self.url_result(canonical_url, 'Rutube') + return self.url_result(canonical_url, RutubeIE.ie_key()) -class RutubeChannelIE(InfoExtractor): +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not isinstance(results, list): + break + + for result in results: + video_url = result.get('video_url') + if not video_url or not isinstance(video_url, compat_str): + continue + entry = self._extract_video(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry + + next_page_url = page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( + self._entries(playlist_id, *args, **kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' @@ -162,27 +213,8 @@ class RutubeChannelIE(InfoExtractor): _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' - def _extract_videos(self, channel_id, channel_title=None): - entries = [] - for pagenum in itertools.count(1): - page = self._download_json( - self._PAGE_TEMPLATE % (channel_id, pagenum), - channel_id, 'Downloading page %s' % pagenum) - results = page['results'] - if not results: - break - entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) - if not page['has_next']: - break - return self.playlist_result(entries, channel_id, channel_title) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id) - - -class RutubeMovieIE(RutubeChannelIE): +class RutubeMovieIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' @@ -196,11 +228,11 @@ class RutubeMovieIE(RutubeChannelIE): movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') - movie_name = movie['name'] - return self._extract_videos(movie_id, movie_name) + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) -class RutubePersonIE(RutubeChannelIE): +class RutubePersonIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' @@ -215,65 +247,33 @@ class RutubePersonIE(RutubeChannelIE): _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' -class RutubePlaylistIE(InfoExtractor): +class RutubePlaylistIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:playlist' IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' _TESTS = [{ - 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', 'info_dict': { - 'id': '4252', + 'id': '3097', }, - 'playlist_count': 25, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, }] - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?(?:.+)?pl_id=(?P<id>\d+)' - _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/source/%s/?page=%s' + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' @staticmethod def suitable(url): params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - return params.get('pl_id') and int_or_none(params['pl_id'][0]) \ - and params.get('pl_type') + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) def _real_extract(self, url): - playlist_id = self._match_id(url) - return self._extract_playlist(playlist_id) - - def _extract_playlist(self, playlist_id): - entries = [] - for pagenum in itertools.count(1): - page_url = self._PAGE_TEMPLATE % (playlist_id, pagenum) - - # download_json will sent an accept: application/xml header - page = self._download_json(page_url, playlist_id, - "Downloading metadata for page %s" % pagenum, - headers={'Accept': 'application/json'}) - - if not page['results']: - break - - results = page['results'] - for result in results: - entry = self.url_result(result.get('video_url'), 'Rutube') - category = try_get(result, lambda x: x['category']['name']) - entry.update({ - 'id': result.get('id'), - 'uploader': try_get(result, lambda x: x['author']['name']), - 'uploader_id': try_get(result, lambda x: x['author']['id']), - 'upload_date': unified_strdate(result.get('created_ts')), - 'title': result.get('title'), - 'description': result.get('description'), - 'thumbnail': result.get('thumbnail_url'), - 'duration': int_or_none(result.get('duration')), - 'category': [category] if category else None, - 'age_limit': 18 if result.get('is_adult') else 0, - 'view_count': int_or_none(result.get('hits')), - 'is_live': result.get('is_livestream'), - 'webpage_url': result.get('video_url'), - }) - entries.append(entry) - - if page['has_next'] is False: - break - - return self.playlist_result(entries, playlist_id, page['name']) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) From c7e327c4d46a9b72f3f707710194dccf6eee50d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 19:08:39 +0700 Subject: [PATCH 155/238] [utils] Introduce bool_or_none --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2554a2abd..c42dd4c3a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1815,6 +1815,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + def strip_or_none(v): return None if v is None else v.strip() From c3dd44e08577c2ae0d08951037db5d1db7a321c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 19:09:27 +0700 Subject: [PATCH 156/238] [rutube] Use bool_or_none --- youtube_dl/extractor/rutube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 5a184879f..828c03b48 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -12,9 +12,10 @@ from ..compat import ( ) from ..utils import ( determine_ext, - unified_timestamp, - try_get, + bool_or_none, int_or_none, + try_get, + unified_timestamp, ) @@ -42,7 +43,7 @@ class RutubeBaseIE(InfoExtractor): 'age_limit': age_limit, 'view_count': int_or_none(video.get('hits')), 'comment_count': int_or_none(video.get('comments_count')), - 'is_live': video.get('is_livestream'), + 'is_live': bool_or_none(video.get('is_livestream')), } From bf6ec2fea9087235c14df2a079620fcc2c17b5eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 22:08:32 +0700 Subject: [PATCH 157/238] [fox] Fix extraction (#14147) --- youtube_dl/extractor/fox.py | 121 ++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 159fdf9c4..facc665f6 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -3,56 +3,99 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - smuggle_url, - update_url_query, + int_or_none, + parse_age_limit, + parse_duration, + try_get, + unified_timestamp, ) class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.fox.com/watch/255180355939/7684182528', + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _TESTS = [{ + # clip + 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', 'md5': 'ebd296fcc41dd4b19f8115d8461a3165', 'info_dict': { - 'id': '255180355939', + 'id': '4b765a60490325103ea69888fb2bd4e8', 'ext': 'mp4', - 'title': 'Official Trailer: Gotham', - 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', - 'duration': 129, - 'timestamp': 1400020798, - 'upload_date': '20140513', - 'uploader': 'NEWA-FNG-FOXCOM', + 'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'description': 'md5:549cd9c70d413adb32ce2a779b53b486', + 'duration': 102, + 'timestamp': 1504291893, + 'upload_date': '20170901', + 'creator': 'FOX', + 'series': 'Gotham', }, - 'add_ie': ['ThePlatform'], - } + 'params': { + 'skip_download': True, + }, + }, { + # episode, geo-restricted + 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', + 'only_matching': True, + }, { + # episode, geo-restricted, tv provided required + 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), video_id) - fox_pdk_player = settings['fox_pdk_player'] - release_url = fox_pdk_player['release_url'] - query = { - 'mbr': 'true', - 'switch': 'http' - } - if fox_pdk_player.get('access') == 'locked': - ap_p = settings['foxAdobePassProvider'] - rating = ap_p.get('videoRating') - if rating == 'n/a': - rating = None - resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) + video = self._download_json( + 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, + video_id, headers={ + 'apikey': 'abdcbed02c124d393b39e818a4312055', + 'Content-Type': 'application/json', + 'Referer': url, + }) - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + title = video['name'] + + m3u8_url = self._download_json( + video['videoRelease']['url'], video_id)['playURL'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = video.get('description') + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = unified_timestamp(video.get('datePublished')) + age_limit = parse_age_limit(video.get('contentRating')) + + data = try_get( + video, lambda x: x['trackingData']['properties'], dict) or {} + + creator = data.get('brand') or data.get('network') or video.get('network') + + series = video.get('seriesName') or data.get( + 'seriesName') or data.get('show') + season_number = int_or_none(video.get('seasonNumber')) + episode = video.get('name') + episode_number = int_or_none(video.get('episodeNumber')) + release_year = int_or_none(video.get('releaseYear')) + + if data.get('authRequired'): + # TODO: AP + pass + + return { 'id': video_id, - }) - - return info + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'creator': creator, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'release_year': release_year, + 'formats': formats, + } From b98339b54b1de517def970a955cbbdda3e1d4874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 22:15:55 +0700 Subject: [PATCH 158/238] [ChangeLog] Actualize --- ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ChangeLog b/ChangeLog index c439c8ef9..86b36c37e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version <unreleased> + +Core ++ [utils] Introduce bool_or_none +* [YoutubeDL] Ensure dir existence for each requested format (#14116) + +Extractors +* [fox] Fix extraction (#14147) +* [rutube] Use bool_or_none +* [rutube] Rework and generalize playlist extractors (#13565) ++ [rutube:playlist] Add support for playlists (#13534, #13565) ++ [radiocanada] Add fallback for title extraction (#14145) +* [vk] Use dedicated YouTube embeds extraction routine +* [vice] Use dedicated YouTube embeds extraction routine +* [cracked] Use dedicated YouTube embeds extraction routine +* [chilloutzone] Use dedicated YouTube embeds extraction routine +* [abcnews] Use dedicated YouTube embeds extraction routine +* [youtube] Separate methods for embeds extraction +* [redtube] Fix formats extraction (#14122) +* [arte] Relax unavailability check (#14112) ++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) +* [vidme:user] Relax URL regular expression (#14054) +* [bpb] Fix extraction (#14043, #14086) +* [soundcloud] Fix download URL with private tracks (#14093) +* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) +* [viidea] Capture and output lecture error message (#14099) +* [radiocanada] Skip unsupported platforms (#14100) + + version 2017.09.02 Extractors From 806498cf2f35cc98cf0e6c5b46f58ca357a842de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 22:16:55 +0700 Subject: [PATCH 159/238] release 2017.09.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 4 +++- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index bd9e21983..fb934d4ba 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.02 +[debug] youtube-dl version 2017.09.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8091e7b5..333acee80 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -82,6 +82,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -149,7 +151,7 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: diff --git a/ChangeLog b/ChangeLog index 86b36c37e..99667e5e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.10 Core + [utils] Introduce bool_or_none diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dbec6c8dc..798a81d3c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -38,6 +38,7 @@ - **afreecatv**: afreecatv.com - **afreecatv:global**: afreecatv.com - **AirMozilla** + - **AliExpressLive** - **AlJazeera** - **Allocine** - **AlphaPorno** @@ -437,6 +438,7 @@ - **MakerTV** - **mangomolo:live** - **mangomolo:video** + - **ManyVids** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** @@ -701,6 +703,7 @@ - **rutube:embed**: Rutube embedded videos - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos + - **rutube:playlist**: Rutube playlists - **RUTV**: RUTV.RU - **Ruutu** - **Ruv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 60ed35de9..736f753df 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.02' +__version__ = '2017.09.10' From f12a6e88b2c2632b10c156eb94d91675327485f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 03:22:27 +0700 Subject: [PATCH 160/238] [rutube:playlist] Fix suitable (closes #14166) --- youtube_dl/extractor/rutube.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 828c03b48..89d89b65a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -265,8 +265,10 @@ class RutubePlaylistIE(RutubePlaylistBaseIE): _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' - @staticmethod - def suitable(url): + @classmethod + def suitable(cls, url): + if not super(RutubePlaylistIE, cls).suitable(url): + return False params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) From 43df248f10548e3c43f0f02584a360136f1129d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 03:27:43 +0700 Subject: [PATCH 161/238] [ChangeLog] Actualize --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index 99667e5e2..189276408 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [rutube:playlist] Fix suitable (#14166) + + version 2017.09.10 Core From 7dacceae75d3c513f442cfd20d778a31bb35d3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 03:30:33 +0700 Subject: [PATCH 162/238] release 2017.09.11 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index fb934d4ba..f40cb2c4e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.10 +[debug] youtube-dl version 2017.09.11 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 189276408..c286da6c6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.11 Extractors * [rutube:playlist] Fix suitable (#14166) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 736f753df..cdcb32e06 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.10' +__version__ = '2017.09.11' From 2709d9fa28155f7abc84d3b57ce4491391d185ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 04:14:54 +0700 Subject: [PATCH 163/238] [animeondemand] Add support for flash videos (closes #9944) --- youtube_dl/extractor/animeondemand.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9e28f2579..c22530778 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -46,6 +46,10 @@ class AnimeOnDemandIE(InfoExtractor): # Full length film, non-series, ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/185', 'only_matching': True, + }, { + # Flash videos + 'url': 'https://www.anime-on-demand.de/anime/12', + 'only_matching': True, }] def _login(self): @@ -120,10 +124,11 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): + r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): attributes = extract_attributes(input_) + title = attributes.get('data-dialog-header') playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist'): + for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): playlist_url = attributes.get(playlist_key) if isinstance(playlist_url, compat_str) and re.match( r'/?[\da-zA-Z]+', playlist_url): @@ -160,6 +165,23 @@ class AnimeOnDemandIE(InfoExtractor): fatal=False) if not playlist: continue + stream_url = playlist.get('streamurl') + if stream_url: + rtmp = re.search( + r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', + stream_url) + if rtmp: + formats.append({ + 'url': rtmp.group('url'), + 'app': rtmp.group('app'), + 'play_path': rtmp.group('playpath'), + 'page_url': url, + 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', + 'rtmp_real_time': True, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + continue start_video = playlist.get('startvideo', 0) playlist = playlist.get('playlist') if not playlist or not isinstance(playlist, list): From 018cc61549f417cf1e88af46ff68a17b75e62630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 04:22:55 +0700 Subject: [PATCH 164/238] [animeondemand] Bypass geo restriction --- youtube_dl/extractor/animeondemand.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index c22530778..2a1cd6520 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -21,6 +21,8 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' + # German-speaking countries of Europe + _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] _TESTS = [{ # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', From 2f483758bc6a6661f1215c38161ee626d90ab655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 04:32:35 +0700 Subject: [PATCH 165/238] [animeondemand] Improve and modernize --- youtube_dl/extractor/animeondemand.py | 34 +++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 2a1cd6520..69d363311 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,16 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, ExtractorError, - sanitized_Request, urlencode_postdata, + urljoin, ) @@ -78,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor): 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', self._LOGIN_URL) + post_url = urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), headers={ + 'Referer': self._LOGIN_URL, + }) if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( @@ -154,17 +150,19 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), + item_id_list = [] + if format_id: + item_id_list.append(format_id) + item_id_list.append('videomaterial') + playlist = self._download_json( + urljoin(url, playlist_url), video_id, + 'Downloading %s JSON' % ' '.join(item_id_list), headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-CSRF-Token': csrf_token, 'Referer': url, 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) - playlist = self._download_json( - request, video_id, 'Downloading %s playlist JSON' % format_id, - fatal=False) + }, fatal=False) if not playlist: continue stream_url = playlist.get('streamurl') @@ -246,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor): f.update({ 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), + 'url': urljoin(url, m.group('href')), }) entries.append(f) From e7c3e33456e155106ad08347d8ab9a2ecd0c8a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Sep 2017 23:19:53 +0700 Subject: [PATCH 166/238] [downloader/fragment] Restart inconsistent incomplete fragment downloads (#13731) --- youtube_dl/downloader/fragment.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bccc8ecc1..6f6fb4a77 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -151,10 +151,15 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) + if ctx['fragment_index'] > 0 and resume_len == 0: + self.report_error( + 'Inconsistent state of incomplete fragment download. ' + 'Restarting from the beginning...') + ctx['fragment_index'] = resume_len = 0 + self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) - if ctx['fragment_index'] > 0: - assert resume_len > 0 + assert ctx['fragment_index'] == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) From 319fc70676fea19b71437aab4078d4d72cc1ba5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Sep 2017 23:50:19 +0700 Subject: [PATCH 167/238] [tv4] Relax _VALID_URL (closes #14206) --- youtube_dl/extractor/tv4.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 7aeb2c620..6487039e5 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -18,7 +18,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^\?]+)\?video_id=| + (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -63,6 +63,10 @@ class TV4IE(InfoExtractor): 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, + { + 'url': ' http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + } ] def _real_extract(self, url): From 0732a90579091ad60b124fd693bdda8ee526e305 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 14 Sep 2017 20:37:46 +0200 Subject: [PATCH 168/238] [orf] Add new extractor for f4m stories --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/orf.py | 116 +++++++++++++++++++++++++++-- 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aefadc56f..a3a97e940 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -768,6 +768,7 @@ from .ora import OraTVIE from .orf import ( ORFTVthekIE, ORFFM4IE, + ORFFM4StoryIE, ORFOE1IE, ORFIPTVIE, ) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index cc296eabd..74fe8017e 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,14 +6,15 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - HEADRequest, - unified_strdate, - strip_jsonp, - int_or_none, - float_or_none, determine_ext, + float_or_none, + HEADRequest, + int_or_none, + orderedSet, remove_end, + strip_jsonp, unescapeHTML, + unified_strdate, ) @@ -307,3 +308,108 @@ class ORFIPTVIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, } + + +class ORFFM4StoryIE(InfoExtractor): + IE_NAME = 'orf:fm4:story' + IE_DESC = 'fm4.orf.at stories' + _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' + + _TEST = { + 'url': 'http://fm4.orf.at/stories/2865738/', + 'playlist': [{ + 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', + 'info_dict': { + 'id': '547792', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + 'duration': 1748.52, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + }, { + 'md5': 'c6dd2179731f86f4f55a7b49899d515f', + 'info_dict': { + 'id': '547798', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live (2)', + 'duration': 1504.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + }, + }], + } + + def _real_extract(self, url): + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + + entries = [] + all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) + for idx, video_id in enumerate(all_ids): + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['q8c'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') + if idx >= 1: + # Titles are duplicates, make them unique + title += ' (' + str(idx + 1) + ')' + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + entries.append({ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + + return self.playlist_result(entries) From fad9fc537d8ab5141f402f6ccdf60161a8d0302a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 14 Sep 2017 20:47:23 +0200 Subject: [PATCH 169/238] [tv4] fix a test URL --- youtube_dl/extractor/tv4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 6487039e5..cfcce020a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -64,7 +64,7 @@ class TV4IE(InfoExtractor): 'only_matching': True, }, { - 'url': ' http://www.tv4play.se/program/farang/3922081', + 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, } ] From c46680fb2a0ee61faa25863d3c10b7b098cdbe67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 01:59:47 +0700 Subject: [PATCH 170/238] [condenast] Fix extraction (closes #14196, closes #14207) --- youtube_dl/extractor/condenast.py | 54 ++++++++++++++++++------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 0c3f0c0e4..ed278fefc 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -116,16 +116,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video_params(self, webpage): - query = {} - params = self._search_regex( - r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) - if params: - query.update({ - 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), - 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), - 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), - }) + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) else: params = extract_attributes(self._search_regex( r'(<[^>]+data-js="video-player"[^>]+>)', @@ -141,17 +141,27 @@ class CondeNastIE(InfoExtractor): video_id = params['videoId'] video_info = None - if params.get('playerId'): - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=params) - if info_page: - video_info = info_page.get('video') - if not video_info: - info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=params) - else: + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: info_page = self._download_webpage( 'https://player.cnevids.com/inline/video/%s.js' % video_id, video_id, 'Downloading inline info', query={ @@ -215,7 +225,7 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage) + params = self._extract_video_params(webpage, display_id) info = self._search_json_ld( webpage, display_id, fatal=False) info.update(self._extract_video(params)) From 86e55e317cb70f07792cfe543186ad520cbe3230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 21:45:18 +0700 Subject: [PATCH 171/238] [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index c286da6c6..38d511362 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version <unreleased> + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression (#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + version 2017.09.11 Extractors From 159d304a9fa2b91d91a60fe3bdf2211a59bcf346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 21:48:06 +0700 Subject: [PATCH 172/238] release 2017.09.15 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f40cb2c4e..98ab5b6ca 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.15*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.15** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.11 +[debug] youtube-dl version 2017.09.15 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 38d511362..041dfd7b9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.15 Core * [downloader/fragment] Restart inconsistent incomplete fragment downloads diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 798a81d3c..6b01dc910 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -593,6 +593,7 @@ - **Openload** - **OraTV** - **orf:fm4**: radio FM4 + - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cdcb32e06..8399c04fe 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.11' +__version__ = '2017.09.15' From cbf85239bbb835162725cd4c8758831ca1003445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 22:13:30 +0700 Subject: [PATCH 173/238] [vgtv] Relax _VALID_URL (closes #14223) --- youtube_dl/extractor/vgtv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 0f8c156a7..c21a09c01 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -42,7 +42,7 @@ class VGTVIE(XstreamIE): ) /? (?: - \#!/(?:video|live)/| + (?:\#!/)?(?:video|live)/| embed?.*id=| articles/ )| @@ -146,7 +146,11 @@ class VGTVIE(XstreamIE): { 'url': 'abtv:140026', 'only_matching': True, - } + }, + { + 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', + 'only_matching': True, + }, ] def _real_extract(self, url): From b763e1d68c6becc414a802a452f5aa819c5de920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 22:18:38 +0700 Subject: [PATCH 174/238] [twitch] Add support for go.twitch.tv URLs (closes #14215) --- youtube_dl/extractor/twitch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2daf9dfac..c926c99a9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import ( class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' @@ -217,7 +217,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -458,7 +458,7 @@ class TwitchStreamIE(TwitchBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/| + (?:(?:www|go)\.)?twitch\.tv/| player\.twitch\.tv/\?.*?\bchannel= ) (?P<id>[^/#?]+) @@ -489,6 +489,9 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?channel=lotsofs', 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/food', + 'only_matching': True, }] @classmethod From 6be44a50edfe2e75e31553e7a128ce1849301958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 22:25:38 +0700 Subject: [PATCH 175/238] [dailymotion:playlist] Relax _VALID_URL (closes #14219) --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 74e991331..e9d0dd19c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -325,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' _TESTS = [{ From a4245acef85ac2414e77cf2cda4cb39adb617241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 23:12:19 +0700 Subject: [PATCH 176/238] [noovo] Fix extraction (closes #14214) --- youtube_dl/extractor/noovo.py | 61 ++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index f7fa098a5..974de3c3e 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + js_to_json, smuggle_url, try_get, ) @@ -24,8 +25,6 @@ class NoovoIE(InfoExtractor): 'timestamp': 1491399228, 'upload_date': '20170405', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': 'RPM+', }, 'params': { @@ -37,13 +36,11 @@ class NoovoIE(InfoExtractor): 'info_dict': { 'id': '5395865725001', 'title': 'Épisode 13 : Les retrouvailles', - 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'description': 'md5:888c3330f0c1b4476c5bc99a1c040473', 'ext': 'mp4', 'timestamp': 1492019320, 'upload_date': '20170412', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': "L'amour est dans le pré", 'season_number': 5, 'episode': 'Épisode 13', @@ -58,40 +55,46 @@ class NoovoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, - video_id)['data'] + webpage = self._download_webpage(url, video_id) - content = try_get(data, lambda x: x['contents'][0]) + bc_url = BrightcoveNewIE._extract_url(self, webpage) - brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + data = self._parse_json( + self._search_regex( + r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + + title = try_get( + data, lambda x: x['video']['nom'], + compat_str) or self._html_search_meta( + 'dcterms.Title', webpage, 'title', fatal=True) + + description = self._html_search_meta( + ('dcterms.Description', 'description'), webpage, 'description') series = try_get( - data, ( - lambda x: x['show']['title'], - lambda x: x['season']['show']['title']), - compat_str) + data, lambda x: x['emission']['nom']) or self._search_regex( + r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)', + webpage, 'series', default=None) - episode = None - og = data.get('og') - if isinstance(og, dict) and og.get('type') == 'video.episode': - episode = og.get('title') + season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} + season = try_get(season_el, lambda x: x['nom'], compat_str) + season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) - video = content or data + episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} + episode = try_get(episode_el, lambda x: x['nom'], compat_str) + episode_number = int_or_none(try_get(episode_el, lambda x: x['numero'])) return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['CA']}), - 'id': brightcove_id, - 'title': video.get('title'), - 'creator': video.get('source'), - 'view_count': int_or_none(video.get('viewsCount')), + 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'title': title, + 'description': description, 'series': series, - 'season_number': int_or_none(try_get( - data, lambda x: x['season']['seasonNumber'])), + 'season': season, + 'season_number': season_number, 'episode': episode, - 'episode_number': int_or_none(data.get('episodeNumber')), + 'episode_number': episode_number, } From 68d43a61b552007a718894967b869c0f1d8ff00f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 16 Sep 2017 12:14:48 +0800 Subject: [PATCH 177/238] Ignore TTML subtitles --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a5b585f43..fbf7cecb2 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ cover/ updates_key.pem *.egg-info *.srt +*.ttml *.sbv *.vtt *.flv From 3869028ffb6be6ab719e5cf1004276dfdfd1216d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 16 Sep 2017 12:18:38 +0800 Subject: [PATCH 178/238] [utils] Use bytes-like objects in dfxp2srt This fixes handling of non-UTF8 TTML subtitles Closes #14191 --- ChangeLog | 6 ++++++ test/test_utils.py | 26 +++++++++++++++++++++++--- youtube_dl/postprocessor/ffmpeg.py | 2 +- youtube_dl/utils.py | 18 +++++++++++------- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 041dfd7b9..ba9260e3e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Core +* [utils] Fix handling raw TTML subtitles (#14191) + + version 2017.09.15 Core diff --git a/test/test_utils.py b/test/test_utils.py index e50f3764e..efa73d0f4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1064,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') <p begin="3" dur="-1">Ignored, three</p> </div> </body> - </tt>''' + </tt>'''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The following line contains Chinese characters and special symbols @@ -1089,7 +1089,7 @@ Line <p begin="0" end="1">The first line</p> </div> </body> - </tt>''' + </tt>'''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The first line @@ -1115,7 +1115,7 @@ The first line <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p> </div> </body> -</tt>''' +</tt>'''.encode('utf-8') srt_data = '''1 00:00:02,080 --> 00:00:05,839 <font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font> @@ -1138,6 +1138,26 @@ part 3</font></u> ''' self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) + dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?> + <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter"> + <body> + <div xml:lang="en"> + <p begin="0" end="1">Line 1</p> + <p begin="1" end="2">第二行</p> + </div> + </body> + </tt>'''.encode('utf-16') + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +Line 1 + +2 +00:00:01,000 --> 00:00:02,000 +第二行 + +''' + self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data) + def test_cli_option(self): self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 51256a3fb..f71d413b5 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): dfxp_file = old_file srt_file = subtitles_filename(filename, lang, 'srt') - with io.open(dfxp_file, 'rt', encoding='utf-8') as f: + with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) with io.open(srt_file, 'wt', encoding='utf-8') as f: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9e4492d40..b724e0b70 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' LEGACY_NAMESPACES = ( - ('http://www.w3.org/ns/ttml', [ - 'http://www.w3.org/2004/11/ttaf1', - 'http://www.w3.org/2006/04/ttaf1', - 'http://www.w3.org/2006/10/ttaf1', + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', ]), - ('http://www.w3.org/ns/ttml#styling', [ - 'http://www.w3.org/ns/ttml#style', + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', ]), ) @@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data): for ns in v: dfxp_data = dfxp_data.replace(ns, k) - dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') From 790d379e4df2f85ece7cab02e805643234bb5c16 Mon Sep 17 00:00:00 2001 From: Windom <windom@users.noreply.github.com> Date: Sat, 16 Sep 2017 18:39:46 +0300 Subject: [PATCH 179/238] [morningstar] Relax _VALID_URL --- youtube_dl/extractor/morningstar.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py index 320d27bdd..0093bcd6c 100644 --- a/youtube_dl/extractor/morningstar.py +++ b/youtube_dl/extractor/morningstar.py @@ -8,8 +8,8 @@ from .common import InfoExtractor class MorningstarIE(InfoExtractor): IE_DESC = 'morningstar.com' - _VALID_URL = r'https?://(?:www\.)?morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', 'info_dict': { @@ -19,7 +19,10 @@ class MorningstarIE(InfoExtractor): 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' } - } + }, { + 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 8251af63a12cd73cf2578c81dbb869232da2592c Mon Sep 17 00:00:00 2001 From: Vijay Singh <sudovijay@users.noreply.github.com> Date: Sat, 16 Sep 2017 21:15:23 +0530 Subject: [PATCH 180/238] [viki] Update app data (closes #14181) --- youtube_dl/extractor/viki.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index e9c8bf824..853e5c75f 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -23,9 +23,9 @@ class VikiBaseIE(InfoExtractor): _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' - _APP = '65535a' + _APP = '100005a' _APP_VERSION = '2.2.5.1428709186' - _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False _NETRC_MACHINE = 'viki' From 4ed2d7b7d1f67e499a46e507d957616e364565ca Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 17 Sep 2017 13:53:04 +0800 Subject: [PATCH 181/238] Fix flake8 issues after #14225 --- youtube_dl/extractor/common.py | 2 +- youtube_dl/utils.py | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 317a9a76f..2bbbf8f4d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2452,7 +2452,7 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, port, not port is None, domain, True, + 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b724e0b70..acc4f987b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3830,23 +3830,23 @@ def cookie_to_dict(cookie): cookie_dict = { 'name': cookie.name, 'value': cookie.value, - }; + } if cookie.port_specified: cookie_dict['port'] = cookie.port if cookie.domain_specified: cookie_dict['domain'] = cookie.domain if cookie.path_specified: cookie_dict['path'] = cookie.path - if not cookie.expires is None: + if cookie.expires is not None: cookie_dict['expires'] = cookie.expires - if not cookie.secure is None: + if cookie.secure is not None: cookie_dict['secure'] = cookie.secure - if not cookie.discard is None: + if cookie.discard is not None: cookie_dict['discard'] = cookie.discard try: if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): cookie_dict['httponly'] = True except TypeError: pass @@ -3957,7 +3957,7 @@ class PhantomJSwrapper(object): cookies = json.loads(f.read().decode('utf-8')) for cookie in cookies: if cookie['httponly'] is True: - cookie['rest'] = { 'httpOnly': None } + cookie['rest'] = {'httpOnly': None} if 'expiry' in cookie: cookie['expire_time'] = cookie['expiry'] self.extractor._set_cookie(**cookie) @@ -3965,7 +3965,7 @@ class PhantomJSwrapper(object): def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ Downloads webpage (if needed) and executes JS - + Params: url: website url html: optional, html code of website @@ -3974,11 +3974,11 @@ class PhantomJSwrapper(object): note2: optional, displayed when executing JS headers: custom http headers jscode: code to be executed when page is loaded - + Returns tuple with: * downloaded website (after JS execution) * anything you print with `console.log` (but not inside `page.execute`!) - + In most cases you don't need to add any `jscode`. It is executed in `page.onLoadFinished`. `saveAndExit();` is mandatory, use it instead of `phantom.exit()` @@ -3992,7 +3992,7 @@ class PhantomJSwrapper(object): else window.setTimeout(check, 500); } - + page.evaluate(function(){ document.querySelector('#a').click(); }); @@ -4024,13 +4024,14 @@ class PhantomJSwrapper(object): else: self.extractor.to_screen('%s: %s' % (video_id, note2)) - p = subprocess.Popen([self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen([ + self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() if p.returncode != 0: - raise ExtractorError('Executing JS failed\n:' - + encodeArgument(err)) + raise ExtractorError( + 'Executing JS failed\n:' + encodeArgument(err)) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') From 9c2a17f2ce7b2b9dc45b603be413a943f6637498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 17 Sep 2017 22:19:57 +0700 Subject: [PATCH 182/238] [popcorntv] Add extractor (closes #5914, closes #14211) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/popcorntv.py | 78 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/popcorntv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a3a97e940..ab95c8575 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -808,6 +808,7 @@ from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, ) +from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE from .pornflip import PornFlipIE diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py new file mode 100644 index 000000000..ac901f426 --- /dev/null +++ b/youtube_dl/extractor/popcorntv.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P<display_id>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(<link[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r'<h1[^>]+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)<article[^>]+itemprop=["\']description[^>]*>(.+?)</article>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + print(self._html_search_meta( + 'duration', webpage)) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } From 4d8c4b46d5668387cc685123f80fb64bbc7c5aff Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Sun, 17 Sep 2017 15:46:52 +0000 Subject: [PATCH 183/238] [heise] Add support for YouTube embeds --- youtube_dl/extractor/heise.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 382f32771..495ffb7dc 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( determine_ext, int_or_none, @@ -25,6 +26,22 @@ class HeiseIE(InfoExtractor): 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', 'thumbnail': r're:^https?://.*/gallery/$', } + }, { + # YouTube embed + 'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', + 'md5': 'e403d2b43fea8e405e88e3f8623909f1', + 'info_dict': { + 'id': '6kmWbXleKW4', + 'ext': 'mp4', + 'title': 'NEU IM SEPTEMBER | Netflix', + 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'upload_date': '20170830', + 'uploader': 'Netflix Deutschland, Österreich und Schweiz', + 'uploader_id': 'netflixdach', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -40,6 +57,16 @@ class HeiseIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('fulltitle', webpage, default=None) + if not title or title == "c't": + title = self._search_regex( + r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') + + yt_urls = YoutubeIE._extract_urls(webpage) + if yt_urls: + return self.playlist_from_matches(yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + container_id = self._search_regex( r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', webpage, 'container ID') @@ -47,12 +74,6 @@ class HeiseIE(InfoExtractor): r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', webpage, 'sequenz ID') - title = self._html_search_meta('fulltitle', webpage, default=None) - if not title or title == "c't": - title = self._search_regex( - r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') - doc = self._download_xml( 'http://www.heise.de/videout/feed', video_id, query={ 'container': container_id, From 8a1a60d17397721620e75d83f2aad3a353286f15 Mon Sep 17 00:00:00 2001 From: Kareem Moussa <itstehkman@gmail.com> Date: Tue, 19 Sep 2017 08:51:20 -0700 Subject: [PATCH 184/238] [devscripts/check-porn] Fix gettestcases import --- devscripts/check-porn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 7a219ebe9..72b2ee422 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -14,7 +14,7 @@ import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_testcases +from test.helper import gettestcases from youtube_dl.utils import compat_urllib_parse_urlparse from youtube_dl.utils import compat_urllib_request @@ -24,7 +24,7 @@ if len(sys.argv) > 1: else: METHOD = 'EURISTIC' -for test in get_testcases(): +for test in gettestcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() From dc76eef092dc10f7e3f599fa7d85a04de8d84b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Sep 2017 23:59:36 +0700 Subject: [PATCH 185/238] [tvplay] Bypass geo restriction --- youtube_dl/extractor/tvplay.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 99ff82a5d..46132eda1 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -15,7 +15,9 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + smuggle_url, try_get, + unsmuggle_url, update_url_query, ) @@ -224,6 +226,9 @@ class TVPlayIE(InfoExtractor): ] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, @@ -426,4 +431,9 @@ class ViafreeIE(InfoExtractor): r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', webpage, 'video id') - return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) + return self.url_result( + smuggle_url( + 'mtg:%s' % video_id, + {'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]]}), + ie=TVPlayIE.ie_key(), video_id=video_id) From 3b65a6fbf31256030ff35210a7be2b50369a6c4f Mon Sep 17 00:00:00 2001 From: capital-G <dennis.scheiba@gmx.de> Date: Tue, 19 Sep 2017 22:58:06 +0200 Subject: [PATCH 186/238] [twitter] Fix duration extraction --- youtube_dl/extractor/twitter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6eaf360a6..7399cf538 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -229,7 +229,7 @@ class TwitterCardIE(TwitterBaseIE): title = self._search_regex(r'<title>([^<]+)', webpage, 'title') thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration')) or duration + duration = float_or_none(config.get('duration'), scale=1000) or duration return { 'id': video_id, @@ -255,6 +255,7 @@ class TwitterIE(InfoExtractor): 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', + 'duration': 12.922, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -305,11 +306,12 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'あかさ - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'あかさ on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'Donte', + 'uploader': 'あかさ', 'uploader_id': 'jaydingeer', + 'duration': 30.0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -337,6 +339,7 @@ class TwitterIE(InfoExtractor): 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', 'uploader_id': 'captainamerica', 'uploader': 'Captain America', + 'duration': 3.17, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -364,6 +367,7 @@ class TwitterIE(InfoExtractor): 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', + 'duration': 277.4, }, 'params': { 'format': 'best[format_id^=http-]', From 12ea5c79fb0bfa878d62d130cf67057fc230dfa7 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Wed, 20 Sep 2017 14:53:06 -0500 Subject: [PATCH 187/238] [nbcsports:vplayer] Correct theplatform URL (closes #13873) --- youtube_dl/extractor/nbc.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 62db70b43..836a41f06 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -109,10 +109,10 @@ class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', 'info_dict': { 'id': '9CsDKds0kvHI', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', 'timestamp': 1426270238, @@ -120,7 +120,7 @@ class NBCSportsVPlayerIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', } }, { - 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, }] @@ -134,7 +134,8 @@ class NBCSportsVPlayerIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage) + theplatform_url = self._og_search_video_url(webpage).replace( + 'vplayer.nbcsports.com', 'player.theplatform.com') return self.url_result(theplatform_url, 'ThePlatform') From f6ff52b473c9ed969fadb3e3d50852c4a27ba17e Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano Date: Wed, 20 Sep 2017 23:05:33 +0200 Subject: [PATCH 188/238] [beeg] Fix extraction (closes #14275) --- youtube_dl/extractor/beeg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index d5c5822f2..bbeae4bac 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -9,6 +9,7 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + urljoin, ) @@ -36,9 +37,11 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) cpl_url = self._search_regex( - r']+src=(["\'])(?P(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', + r']+src=(["\'])(?P(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', webpage, 'cpl', default=None, group='url') + cpl_url = urljoin(url, cpl_url) + beeg_version, beeg_salt = [None] * 2 if cpl_url: @@ -54,7 +57,7 @@ class BeegIE(InfoExtractor): r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg salt', default=None, group='beeg_salt') - beeg_version = beeg_version or '2000' + beeg_version = beeg_version or '2185' beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' video = self._download_json( From 8c6919e4331e1cd44f50e700e8fc4e630d913a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Sep 2017 23:00:35 +0700 Subject: [PATCH 189/238] [lynda] Add support for educourse.ga (closes #14286) --- youtube_dl/extractor/lynda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index d2f75296a..1b6f5091d 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,7 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?lynda\.com/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -110,6 +110,9 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', 'only_matching': True, + }, { + 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -253,7 +256,7 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 8c2895305dc09920055611c8120f5a65fcd2614f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Sep 2017 02:30:03 +0800 Subject: [PATCH 190/238] [options] Accept lrc as a subtitle conversion target format (closes #14292) --- ChangeLog | 1 + youtube_dl/__init__.py | 2 +- youtube_dl/options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ba9260e3e..42ba879b1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Core ++ [options] Accept lrc as a subtitle conversion target format (#14292) * [utils] Fix handling raw TTML subtitles (#14191) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c4589411e..ba684a075 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -206,7 +206,7 @@ def _real_main(argv=None): if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: parser.error('invalid subtitle format specified') if opts.date is not None: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 38439c971..4c0455044 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -847,7 +847,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--convert-subs', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt)') + help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') parser.add_option_group(general) parser.add_option_group(network) From 2384f5a64e501d7abb844e8d31fe340b34d8d4e7 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 6 Sep 2017 11:24:34 +0900 Subject: [PATCH 191/238] [mixcloud] Fix extraction (closes #14088) --- youtube_dl/compat.py | 10 +- youtube_dl/extractor/mixcloud.py | 157 ++++++++++++++++++++----------- 2 files changed, 109 insertions(+), 58 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e4e13bcf..2a62248ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -6,6 +6,7 @@ import collections import email import getpass import io +import itertools import optparse import os import re @@ -15,7 +16,6 @@ import socket import struct import subprocess import sys -import itertools import xml.etree.ElementTree @@ -2898,6 +2898,13 @@ else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip __all__ = [ 'compat_HTMLParseError', @@ -2948,5 +2955,6 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', + 'compat_zip', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index f6360cce6..481182380 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,16 +9,16 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, - compat_str, compat_urllib_parse_unquote, compat_urlparse, + compat_zip ) from ..utils import ( clean_html, ExtractorError, OnDemandPagedList, str_to_int, -) + try_get) class MixcloudIE(InfoExtractor): @@ -54,27 +54,19 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] - _keys = [ - 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };', - 'pleasedontdownloadourmusictheartistswontgetpaid', - 'window.addEventListener = window.addEventListener || function() {};', - '(function() { return new Date().toLocaleDateString(); })()' - ] - _current_key = None + @staticmethod + def _decrypt_xor_cipher(key, ciphertext): + """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(k)) + for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) - # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - def _decrypt_play_info(self, play_info, video_id): - play_info = base64.b64decode(play_info.encode('ascii')) - for num, key in enumerate(self._keys, start=1): - try: - return self._parse_json( - ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) - for idx, ch in enumerate(play_info)]), - video_id) - except ExtractorError: - if num == len(self._keys): - raise + @staticmethod + def _decrypt_and_extend(stream_info, url_key, getter, key, formats): + maybe_url = stream_info.get(url_key) + if maybe_url is not None: + decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) + formats.extend(getter(decrypted)) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -84,54 +76,105 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) - if not self._current_key: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url: - js = self._download_webpage(js_url, track_id, fatal=False) - if js: - KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): - key = self._search_regex( - KEY_RE_TEMPLATE % key_name, js, 'key', - default=None, group='key') - if key and isinstance(key, compat_str): - self._keys.insert(0, key) - self._current_key = key + # Legacy path + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + + if encrypted_play_info is not None: + # Decode + encrypted_play_info = base64.b64decode(encrypted_play_info) + else: + # New path + full_info_json = self._parse_json(self._html_search_regex( + r'', webpage, 'play info'), 'play info') + for item in full_info_json: + item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + if try_get(item_data, lambda x: x['streamInfo']['url']): + info_json = item_data + break + else: + raise ExtractorError('Failed to extract matching stream info') message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info') + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', + webpage, 'js url', default=None) + if js_url is None: + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id) + # Known plaintext attack + if encrypted_play_info: + kps = ['{"stream_url":'] + kpa_target = encrypted_play_info + else: + kps = ['https://', 'http://'] + kpa_target = base64.b64decode(info_json['streamInfo']['url']) + for kp in kps: + partial_key = self._decrypt_xor_cipher(kpa_target, kp) + for quote in ["'", '"']: + key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, + "encryption key", default=None) + if key is not None: + break + else: + continue + break + else: + raise ExtractorError('Failed to extract encryption key') - play_info = self._decrypt_play_info(encrypted_play_info, track_id) + if encrypted_play_info is not None: + play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') + if message and 'stream_url' not in play_info: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + song_url = play_info['stream_url'] + formats = [{ + 'format_id': 'normal', + 'url': song_url + }] - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') + thumbnail = self._proto_relative_url(self._html_search_regex( + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) + uploader = self._html_search_regex( + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) + uploader_id = self._search_regex( + r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) + description = self._og_search_description(webpage) + view_count = str_to_int(self._search_regex( + [r'([0-9,.]+)', + r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], + webpage, 'play count', default=None)) - song_url = play_info['stream_url'] + else: + title = info_json['name'] + thumbnail = try_get(info_json, + lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + uploader = try_get(info_json, lambda x: x['owner']['displayName']) + uploader_id = try_get(info_json, lambda x: x['owner']['username']) + description = try_get(info_json, lambda x: x['description']) + view_count = try_get(info_json, lambda x: x['plays']) - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'([0-9,.]+)', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + stream_info = info_json['streamInfo'] + formats = [] + self._decrypt_and_extend(stream_info, 'url', lambda x: [{ + 'format_id': 'normal', + 'url': x + }], key, formats) + self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, + formats) + self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, + formats) return { 'id': track_id, 'title': title, - 'url': song_url, + 'formats': formats, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, From 095774e59130c999ed8ce132f80a7164c5ee39a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 05:35:55 +0700 Subject: [PATCH 192/238] [mixcloud] Improve and simplify (closes #14132) --- youtube_dl/extractor/mixcloud.py | 71 ++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 481182380..f331db890 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, + compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_zip @@ -16,9 +17,12 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + int_or_none, OnDemandPagedList, str_to_int, - try_get) + try_get, + urljoin, +) class MixcloudIE(InfoExtractor): @@ -61,13 +65,6 @@ class MixcloudIE(InfoExtractor): compat_chr(compat_ord(ch) ^ compat_ord(k)) for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) - @staticmethod - def _decrypt_and_extend(stream_info, url_key, getter, key, formats): - maybe_url = stream_info.get(url_key) - if maybe_url is not None: - decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) - formats.extend(getter(decrypted)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) @@ -86,9 +83,12 @@ class MixcloudIE(InfoExtractor): else: # New path full_info_json = self._parse_json(self._html_search_regex( - r'', webpage, 'play info'), 'play info') + r'', + webpage, 'play info'), 'play info') for item in full_info_json: - item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + item_data = try_get( + item, lambda x: x['cloudcast']['data']['cloudcastLookup'], + dict) if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break @@ -100,13 +100,9 @@ class MixcloudIE(InfoExtractor): webpage, 'error message', default=None) js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url is None: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', - webpage, 'js url') - js = self._download_webpage(js_url, track_id) + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id, 'Downloading JS') # Known plaintext attack if encrypted_play_info: kps = ['{"stream_url":'] @@ -117,8 +113,9 @@ class MixcloudIE(InfoExtractor): for kp in kps: partial_key = self._decrypt_xor_cipher(kpa_target, kp) for quote in ["'", '"']: - key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, - "encryption key", default=None) + key = self._search_regex( + r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), + js, 'encryption key', default=None) if key is not None: break else: @@ -153,23 +150,37 @@ class MixcloudIE(InfoExtractor): else: title = info_json['name'] - thumbnail = try_get(info_json, - lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + thumbnail = urljoin( + 'https://thumbnailer.mixcloud.com/unsafe/600x600/', + try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str)) uploader = try_get(info_json, lambda x: x['owner']['displayName']) uploader_id = try_get(info_json, lambda x: x['owner']['username']) description = try_get(info_json, lambda x: x['description']) - view_count = try_get(info_json, lambda x: x['plays']) + view_count = int_or_none(try_get(info_json, lambda x: x['plays'])) stream_info = info_json['streamInfo'] formats = [] - self._decrypt_and_extend(stream_info, 'url', lambda x: [{ - 'format_id': 'normal', - 'url': x - }], key, formats) - self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, - formats) - self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, - formats) + + for url_key in ('url', 'hlsUrl', 'dashUrl'): + format_url = stream_info.get(url_key) + if not format_url: + continue + decrypted = self._decrypt_xor_cipher(key, base64.b64decode(format_url)) + if not decrypted: + continue + if url_key == 'hlsUrl': + formats.extend(self._extract_m3u8_formats( + decrypted, track_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif url_key == 'dashUrl': + formats.extend(self._extract_mpd_formats( + decrypted, track_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 'url': decrypted, + }) + self._sort_formats(formats) return { 'id': track_id, From 9ce1ac404648142139f8b231c674d434ad4f9ffe Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Fri, 22 Sep 2017 22:49:48 +0000 Subject: [PATCH 193/238] [generic] Fix support for multiple HTML5 videos on one page (closes #14080) --- youtube_dl/extractor/generic.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b83c18380..7d0edf09c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1879,6 +1879,15 @@ class GenericIE(InfoExtractor): 'title': 'Building A Business Online: Principal Chairs Q & A', }, }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2849,13 +2858,20 @@ class GenericIE(InfoExtractor): # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: - for entry in entries: - entry.update({ + if len(entries) == 1: + entries[0].update({ 'id': video_id, 'title': video_title, }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: self._sort_formats(entry['formats']) - return self.playlist_result(entries) + return self.playlist_result(entries, video_id, video_title) jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) From 13de91c9e92bd831fee38fddbdabce7f6e82ef91 Mon Sep 17 00:00:00 2001 From: Dan Weber Date: Tue, 12 Sep 2017 22:52:54 -0400 Subject: [PATCH 194/238] [americastestkitchen] Add extractor (closes #10764) --- youtube_dl/extractor/americastestkitchen.py | 85 +++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 86 insertions(+) create mode 100755 youtube_dl/extractor/americastestkitchen.py diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py new file mode 100755 index 000000000..f231e7f6e --- /dev/null +++ b/youtube_dl/extractor/americastestkitchen.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/episode/(?P\d+)' + _TESTS = [{ + 'url': + 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '1_5g5zua6e', + 'title': 'atk_s17_e24.mp4', + 'ext': 'mp4', + 'description': '

Host Julia Collin Davison goes into the test kitchen with test cook Dan Souza to learn how to make the ultimate Grill-Roasted Beef Tenderloin. Next, equipment expert Adam Ried reviews gas grills in the Equipment Corner. Then, gadget guru Lisa McManus uncovers the best quirky gadgets. Finally, test cook Erin McMurrer shows host Bridget Lancaster how to make an elegant Pear-Walnut Upside-Down Cake.

', + 'timestamp': 1497285541, + 'upload_date': '20170612', + 'uploader_id': 'roger.metcalf@americastestkitchen.com', + 'release_date': '2017-06-17', + 'thumbnail': 'http://d3cizcpymoenau.cloudfront.net/images/35973/e24-tenderloin-16.jpg', + 'episode_number': 24, + 'episode': 'Summer Dinner Party', + 'episode_id': '548-summer-dinner-party', + 'season_number': 17 + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': + 'https://www.americastestkitchen.com/episode/546-a-spanish-affair', + 'only_matching': + True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + partner_id = self._search_regex( + r'partner_id/(?P\d+)', + webpage, + 'partner_id', + group='partner_id') + + video_data = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?});\s*', + webpage, 'initial context'), + video_id) + + episode_data = video_data['episodeDetail']['content']['data'] + episode_content_meta = episode_data['full_video'] + external_id = episode_content_meta['external_id'] + + # photo data + photo_data = episode_content_meta.get('photo') + thumbnail = photo_data.get('image_url') if photo_data else None + + # meta + release_date = episode_data.get('aired_at') + description = episode_content_meta.get('description') + episode_number = int(episode_content_meta.get('episode_number')) + episode = episode_content_meta.get('title') + episode_id = episode_content_meta.get('episode_slug') + season_number = int(episode_content_meta.get('season_number')) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, external_id), + 'ie_key': 'Kaltura', + 'id': video_id, + 'release_date': release_date, + 'thumbnail': thumbnail, + 'description': description, + 'episode_number': episode_number, + 'episode': episode, + 'episode_id': episode_id, + 'season_number': season_number + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab95c8575..585300500 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -39,6 +39,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE +from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anvato import AnvatoIE From 4bb58fa118a8c75b2ecf05f7b29a0ae27eef6239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 06:28:46 +0700 Subject: [PATCH 195/238] [americastestkitchen] Improve (closes #13996) --- youtube_dl/extractor/americastestkitchen.py | 82 ++++++++++----------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index f231e7f6e..01736872d 100755 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,85 +1,85 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + try_get, + unified_strdate, +) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/episode/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' _TESTS = [{ - 'url': - 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '1_5g5zua6e', - 'title': 'atk_s17_e24.mp4', + 'title': 'Summer Dinner Party', 'ext': 'mp4', - 'description': '

Host Julia Collin Davison goes into the test kitchen with test cook Dan Souza to learn how to make the ultimate Grill-Roasted Beef Tenderloin. Next, equipment expert Adam Ried reviews gas grills in the Equipment Corner. Then, gadget guru Lisa McManus uncovers the best quirky gadgets. Finally, test cook Erin McMurrer shows host Bridget Lancaster how to make an elegant Pear-Walnut Upside-Down Cake.

', + 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1497285541, 'upload_date': '20170612', 'uploader_id': 'roger.metcalf@americastestkitchen.com', - 'release_date': '2017-06-17', - 'thumbnail': 'http://d3cizcpymoenau.cloudfront.net/images/35973/e24-tenderloin-16.jpg', - 'episode_number': 24, + 'release_date': '20170617', + 'series': "America's Test Kitchen", + 'season_number': 17, 'episode': 'Summer Dinner Party', - 'episode_id': '548-summer-dinner-party', - 'season_number': 17 + 'episode_number': 24, }, 'params': { - # m3u8 download 'skip_download': True, }, }, { - 'url': - 'https://www.americastestkitchen.com/episode/546-a-spanish-affair', - 'only_matching': - True, + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) partner_id = self._search_regex( - r'partner_id/(?P\d+)', - webpage, - 'partner_id', - group='partner_id') + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') video_data = self._parse_json( self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?});\s*', + r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', webpage, 'initial context'), video_id) - episode_data = video_data['episodeDetail']['content']['data'] - episode_content_meta = episode_data['full_video'] - external_id = episode_content_meta['external_id'] + ep_data = try_get( + video_data, + (lambda x: x['episodeDetail']['content']['data'], + lambda x: x['videoDetail']['content']['data']), dict) + ep_meta = ep_data.get('full_video', {}) + external_id = ep_data.get('external_id') or ep_meta['external_id'] - # photo data - photo_data = episode_content_meta.get('photo') - thumbnail = photo_data.get('image_url') if photo_data else None + title = ep_data.get('title') or ep_meta.get('title') + description = clean_html(ep_meta.get('episode_description') or ep_data.get( + 'description') or ep_meta.get('description')) + thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) + release_date = unified_strdate(ep_data.get('aired_at')) - # meta - release_date = episode_data.get('aired_at') - description = episode_content_meta.get('description') - episode_number = int(episode_content_meta.get('episode_number')) - episode = episode_content_meta.get('title') - episode_id = episode_content_meta.get('episode_slug') - season_number = int(episode_content_meta.get('season_number')) + season_number = int_or_none(ep_meta.get('season_number')) + episode = ep_meta.get('title') + episode_number = int_or_none(ep_meta.get('episode_number')) return { '_type': 'url_transparent', 'url': 'kaltura:%s:%s' % (partner_id, external_id), 'ie_key': 'Kaltura', - 'id': video_id, - 'release_date': release_date, - 'thumbnail': thumbnail, + 'title': title, 'description': description, - 'episode_number': episode_number, + 'thumbnail': thumbnail, + 'release_date': release_date, + 'series': "America's Test Kitchen", + 'season_number': season_number, 'episode': episode, - 'episode_id': episode_id, - 'season_number': season_number + 'episode_number': episode_number, } From 5c1452e8f1e744db14be1baef840e9f531e8f144 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano Date: Sat, 23 Sep 2017 01:38:09 +0200 Subject: [PATCH 196/238] [twitter] Add support for user_id-less URLs (closes #14270) --- youtube_dl/extractor/twitter.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 7399cf538..0df3ad7c7 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -242,8 +242,9 @@ class TwitterCardIE(TwitterBaseIE): class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P[^/]+))/status/(?P\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' + _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -322,9 +323,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'FilmDrunk - Vine of the day', - 'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'FilmDrunk', + 'title': 'Vince Mancini - Vine of the day', + 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', + 'uploader': 'Vince Mancini', 'uploader_id': 'Filmdrunk', 'timestamp': 1402826626, 'upload_date': '20140615', @@ -372,6 +373,21 @@ class TwitterIE(InfoExtractor): 'params': { 'format': 'best[format_id^=http-]', }, + }, { + 'url': 'https://twitter.com/i/web/status/910031516746514432', + 'info_dict': { + 'id': '910031516746514432', + 'ext': 'mp4', + 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'uploader': 'Préfet de Guadeloupe', + 'uploader_id': 'Prefet971', + 'duration': 47.48, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -380,11 +396,15 @@ class TwitterIE(InfoExtractor): twid = mobj.group('id') webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_URL % (user_id, twid), twid) + self._TEMPLATE_STATUSES_URL % twid, twid) if 'twitter.com/account/suspended' in urlh.geturl(): raise ExtractorError('Account suspended by Twitter.', expected=True) + if user_id is None: + mobj = re.match(self._VALID_URL, urlh.geturl()) + user_id = mobj.group('user_id') + username = remove_end(self._og_search_title(webpage), ' on Twitter') title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') From 1c22d7a7f30917abfd2b7495f7bd02d51cb8528a Mon Sep 17 00:00:00 2001 From: Namnamseo <0201ssw+github@gmail.com> Date: Thu, 24 Aug 2017 11:32:24 +0900 Subject: [PATCH 197/238] [kakao] Add extractor (closes #12298) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kakao.py | 140 +++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 youtube_dl/extractor/kakao.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 585300500..4232a4fef 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -483,6 +483,7 @@ from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE +from .kakao import KakaoIE from .kaltura import KalturaIE from .kamcord import KamcordIE from .kanalplay import KanalPlayIE diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py new file mode 100644 index 000000000..0caa41e9e --- /dev/null +++ b/youtube_dl/extractor/kakao.py @@ -0,0 +1,140 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + compat_str, + unified_timestamp, +) + + +class KakaoIE(InfoExtractor): + _VALID_URL = r'https?://tv.kakao.com/channel/(?P\d+)/cliplink/(?P\d+)' + IE_NAME = 'kakao.com' + + _TESTS = [{ + 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', + 'md5': '702b2fbdeb51ad82f5c904e8c0766340', + 'info_dict': { + 'id': '301965083', + 'ext': 'mp4', + 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', + 'uploader_id': 2671005, + 'uploader': '그랑그랑이', + 'timestamp': 1488160199, + 'upload_date': '20170227', + } + }, { + 'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': '300103180', + 'ext': 'mp4', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'uploader_id': 2653210, + 'uploader': '쇼 음악중심', + 'timestamp': 1485684628, + 'upload_date': '20170129', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_url = 'http://tv.kakao.com/embed/player/cliplink/' + video_id + \ + '?service=kakao_tv&autoplay=1&profile=HIGH&wmode=transparent' + player_header = {'Referer': player_url} + + impress = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/impress' % video_id, + video_id, 'Downloading video info', + query={ + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + 'fields': 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' + }, headers=player_header) + + clipLink = impress['clipLink'] + clip = clipLink['clip'] + + video_info = { + 'id': video_id, + 'title': clip['title'], + 'description': clip.get('description'), + 'uploader': clipLink.get('channel', {}).get('name'), + 'uploader_id': clipLink.get('channelId'), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + } + + tid = impress.get('tid', '') + raw = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw' % video_id, + video_id, 'Downloading video formats info', + query={ + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': 'HIGH', + 'dteType': 'PC', + }, headers=player_header, fatal=False) + + formats = [] + for fmt in raw.get('outputList', []): + try: + profile_name = fmt['profile'] + fmt_url_json = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw/videolocation' % video_id, + video_id, 'Downloading video URL for profile %s' % profile_name, + query={ + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': profile_name + }, headers=player_header, fatal=False) + + if fmt_url_json is None: + continue + + fmt_url = fmt_url_json['url'] + formats.append({ + 'url': fmt_url, + 'format_id': profile_name, + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + 'format_note': fmt.get('label'), + 'filesize': int_or_none(fmt.get('filesize')) + }) + except KeyError: + pass + + self._sort_formats(formats) + video_info['formats'] = formats + + top_thumbnail = clip.get('thumbnailUrl') + thumbs = [] + for thumb in clip.get('clipChapterThumbnailList', []): + thumbs.append({ + 'url': thumb.get('thumbnailUrl'), + 'id': compat_str(thumb.get('timeInSec')), + 'preference': -1 if thumb.get('isDefault') else 0 + }) + video_info['thumbnail'] = top_thumbnail + video_info['thumbnails'] = thumbs + + upload_date = unified_timestamp(clipLink.get('createTime')) + video_info['timestamp'] = upload_date + + return video_info From f70ddd4aebbfb0bdf2f63c1eba5b5614d2cfb70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:25:15 +0700 Subject: [PATCH 198/238] [kakao] Improve (closes #14007) --- youtube_dl/extractor/kakao.py | 117 ++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 0caa41e9e..c9b438efc 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,16 +3,17 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, - compat_str, unified_timestamp, + update_url_query, ) class KakaoIE(InfoExtractor): _VALID_URL = r'https?://tv.kakao.com/channel/(?P\d+)/cliplink/(?P\d+)' - IE_NAME = 'kakao.com' + _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks' _TESTS = [{ 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', @@ -44,60 +45,57 @@ class KakaoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - player_url = 'http://tv.kakao.com/embed/player/cliplink/' + video_id + \ - '?service=kakao_tv&autoplay=1&profile=HIGH&wmode=transparent' - player_header = {'Referer': player_url} - - impress = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/impress' % video_id, - video_id, 'Downloading video info', - query={ - 'player': 'monet_html5', - 'referer': url, - 'uuid': '', - 'service': 'kakao_tv', - 'section': '', - 'dteType': 'PC', - 'fields': 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' - }, headers=player_header) - - clipLink = impress['clipLink'] - clip = clipLink['clip'] - - video_info = { - 'id': video_id, - 'title': clip['title'], - 'description': clip.get('description'), - 'uploader': clipLink.get('channel', {}).get('name'), - 'uploader_id': clipLink.get('channelId'), - 'duration': int_or_none(clip.get('duration')), - 'view_count': int_or_none(clip.get('playCount')), - 'like_count': int_or_none(clip.get('likeCount')), - 'comment_count': int_or_none(clip.get('commentCount')), + player_header = { + 'Referer': update_url_query( + 'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, { + 'service': 'kakao_tv', + 'autoplay': '1', + 'profile': 'HIGH', + 'wmode': 'transparent', + }) } + QUERY_COMMON = { + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + } + + query = QUERY_COMMON.copy() + query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' + impress = self._download_json( + '%s/%s/impress' % (self._API_BASE, video_id), + video_id, 'Downloading video info', + query=query, headers=player_header) + + clip_link = impress['clipLink'] + clip = clip_link['clip'] + + title = clip.get('title') or clip_link.get('displayTitle') + tid = impress.get('tid', '') + + query = QUERY_COMMON.copy() + query.update({ + 'tid': tid, + 'profile': 'HIGH', + }) raw = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw' % video_id, + '%s/%s/raw' % (self._API_BASE, video_id), video_id, 'Downloading video formats info', - query={ - 'player': 'monet_html5', - 'referer': url, - 'uuid': '', - 'service': 'kakao_tv', - 'section': '', - 'tid': tid, - 'profile': 'HIGH', - 'dteType': 'PC', - }, headers=player_header, fatal=False) + query=query, headers=player_header) formats = [] for fmt in raw.get('outputList', []): try: profile_name = fmt['profile'] fmt_url_json = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw/videolocation' % video_id, - video_id, 'Downloading video URL for profile %s' % profile_name, + '%s/%s/raw/videolocation' % (self._API_BASE, video_id), + video_id, + 'Downloading video URL for profile %s' % profile_name, query={ 'service': 'kakao_tv', 'section': '', @@ -119,11 +117,8 @@ class KakaoIE(InfoExtractor): }) except KeyError: pass - self._sort_formats(formats) - video_info['formats'] = formats - top_thumbnail = clip.get('thumbnailUrl') thumbs = [] for thumb in clip.get('clipChapterThumbnailList', []): thumbs.append({ @@ -131,10 +126,24 @@ class KakaoIE(InfoExtractor): 'id': compat_str(thumb.get('timeInSec')), 'preference': -1 if thumb.get('isDefault') else 0 }) - video_info['thumbnail'] = top_thumbnail - video_info['thumbnails'] = thumbs + top_thumbnail = clip.get('thumbnailUrl') + if top_thumbnail: + thumbs.append({ + 'url': top_thumbnail, + 'preference': 10, + }) - upload_date = unified_timestamp(clipLink.get('createTime')) - video_info['timestamp'] = upload_date - - return video_info + return { + 'id': video_id, + 'title': title, + 'description': clip.get('description'), + 'uploader': clip_link.get('channel', {}).get('name'), + 'uploader_id': clip_link.get('channelId'), + 'thumbnails': thumbs, + 'timestamp': unified_timestamp(clip_link.get('createTime')), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + 'formats': formats, + } From 7f4921b38d10c17fe354bab20b741b362c5ae0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:26:40 +0700 Subject: [PATCH 199/238] [heise] PEP 8 --- youtube_dl/extractor/heise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 495ffb7dc..82e11a7d8 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -60,8 +60,8 @@ class HeiseIE(InfoExtractor): title = self._html_search_meta('fulltitle', webpage, default=None) if not title or title == "c't": title = self._search_regex( - r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') + r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') yt_urls = YoutubeIE._extract_urls(webpage) if yt_urls: From 136507b39a2b48cb775249e9724eeeedb56baed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:41:22 +0700 Subject: [PATCH 200/238] [24video] Add support for 24video.adult (closes #14295) --- youtube_dl/extractor/twentyfourvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 7af11659f..cc51ca0ae 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', From e3440d824a7326d0ba609d8f0896203208ecc558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:42:17 +0700 Subject: [PATCH 201/238] [24video] Fix timestamp extraction and make non fatal (#14295) --- youtube_dl/extractor/twentyfourvideo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index cc51ca0ae..96e0b96e3 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -60,8 +60,8 @@ class TwentyFourVideoIE(InfoExtractor): duration = int_or_none(self._og_search_property( 'duration', webpage, 'duration', fatal=False)) timestamp = parse_iso8601(self._search_regex( - r'

', webpage, 'title', default=None) or self._og_search_title( - webpage)).strip() + webpage, default=None)) video_id = self._html_search_regex( - r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', group='id') - - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), display_id) - - formats = [] - for target in data['targetUrls']: - format_url, format_type = target.get('url'), target.get('type') - if not format_url or not format_type: - continue - if format_type == 'HLS': - formats.extend(self._extract_m3u8_formats( - format_url, display_id, entry_protocol='m3u8_native', - ext='mp4', preference=0, fatal=False, m3u8_id=format_type)) - elif format_type == 'HDS': - formats.extend(self._extract_f4m_formats( - format_url, display_id, f4m_id=format_type, fatal=False)) - elif format_type == 'MPEG_DASH': - formats.extend(self._extract_mpd_formats( - format_url, display_id, mpd_id=format_type, fatal=False)) - else: - formats.append({ - 'format_id': format_type, - 'url': format_url, - }) - self._sort_formats(formats) - - subtitles = {} - subtitle_urls = data.get('subtitleUrls') - if isinstance(subtitle_urls, list): - for subtitle in subtitle_urls: - subtitle_url = subtitle.get('url') - if subtitle_url and subtitle.get('type') == 'CLOSED': - subtitles.setdefault('nl', []).append({'url': subtitle_url}) + r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', + group='id') return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id), + 'ie_key': CanvasIE.ie_key(), 'id': video_id, 'display_id': display_id, 'title': title, 'description': self._og_search_description(webpage), - 'formats': formats, - 'duration': float_or_none(data.get('duration'), 1000), - 'thumbnail': data.get('posterImageUrl'), - 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4232a4fef..24e9acda6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -150,7 +150,10 @@ from .camdemy import ( from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE -from .canvas import CanvasIE +from .canvas import ( + CanvasIE, + CanvasEenIE, +) from .carambatv import ( CarambaTVIE, CarambaTVPageIE, From 544ffb7790ea1ac7bbc081d2e486101c0382b900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Oct 2017 04:15:12 +0700 Subject: [PATCH 231/238] [ketnet] Add support for videos without direct sources (closes #14377) --- youtube_dl/extractor/ketnet.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py index fb9c2dbd4..93a98e1e0 100644 --- a/youtube_dl/extractor/ketnet.py +++ b/youtube_dl/extractor/ketnet.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +from .canvas import CanvasIE from .common import InfoExtractor @@ -7,7 +8,7 @@ class KetnetIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': 'd907f7b1814ef0fa285c0475d9994ed7', + 'md5': '6bdeb65998930251bbd1c510750edba9', 'info_dict': { 'id': 'zomerse-filmpjes', 'ext': 'mp4', @@ -15,6 +16,20 @@ class KetnetIE(InfoExtractor): 'description': 'Gluur mee met Ghost Rockers op de filmset', 'thumbnail': r're:^https?://.*\.jpg$', } + }, { + # mzid in playerConfig instead of sources + 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', + 'md5': '90139b746a0a9bd7bb631283f6e2a64e', + 'info_dict': { + 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', + 'ext': 'flv', + 'title': 'Nachtwacht: De Greystook', + 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1468.03, + }, + 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', 'only_matching': True, @@ -38,6 +53,12 @@ class KetnetIE(InfoExtractor): 'player config'), video_id) + mzid = config.get('mzid') + if mzid: + return self.url_result( + 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, + CanvasIE.ie_key(), video_id=mzid) + title = config['title'] formats = [] From d2ae7e24e5c574fa45621771a134e58b21443e5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 Oct 2017 04:43:25 +0700 Subject: [PATCH 232/238] [postprocessor/ffmpeg] Convert to opus using libopus (closes #14381) --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index f71d413b5..3ea1afcf3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -44,7 +44,7 @@ ACODECS = { 'aac': 'aac', 'flac': 'flac', 'm4a': 'aac', - 'opus': 'opus', + 'opus': 'libopus', 'vorbis': 'libvorbis', 'wav': None, } From b7e14f06a4a4fbaafc593c2f118e4b0f5d8d7937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20Ku=CC=88ch?= Date: Tue, 3 Oct 2017 15:17:28 +0200 Subject: [PATCH 233/238] Fix for JSON meta data download Added fixes according to #13651 and user @remitamine --- youtube_dl/extractor/nbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 836a41f06..35151f527 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -15,7 +15,7 @@ from ..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?Pn?\d+))' + _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?Pn?\d+))' _TESTS = [ { @@ -72,6 +72,7 @@ class NBCIE(AdobePassIE): def _real_extract(self, url): permalink, video_id = re.match(self._VALID_URL, url).groups() + permalink = 'http' + permalink video_data = self._download_json( 'https://api.nbc.com/v3/videos', video_id, query={ 'filter[permalink]': permalink, From 3e4cedf9e8cd3157df2457df7274d0c842421945 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Tue, 3 Oct 2017 18:28:13 +0200 Subject: [PATCH 234/238] [tvn24] Relax _VALID_URL --- youtube_dl/extractor/tvn24.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tvn24.py b/youtube_dl/extractor/tvn24.py index 12ed6039c..6590e1fd0 100644 --- a/youtube_dl/extractor/tvn24.py +++ b/youtube_dl/extractor/tvn24.py @@ -9,7 +9,7 @@ from ..utils import ( class TVN24IE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P[^/]+)' _TESTS = [{ 'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html', 'md5': 'fbdec753d7bc29d96036808275f2130c', @@ -18,7 +18,7 @@ class TVN24IE(InfoExtractor): 'ext': 'mp4', 'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"', 'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości "Szkła kontaktowego".', - 'thumbnail': 're:http://.*[.]jpeg', + 'thumbnail': 're:https?://.*[.]jpeg', } }, { 'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html', @@ -29,6 +29,9 @@ class TVN24IE(InfoExtractor): }, { 'url': 'http://tvn24bis.pl/poranek,146,m/gen-koziej-w-tvn24-bis-wracamy-do-czasow-zimnej-wojny,715660.html', 'only_matching': True, + }, { + 'url': 'https://www.tvn24.pl/magazyn-tvn24/angie-w-jednej-czwartej-polka-od-szarej-myszki-do-cesarzowej-europy,119,2158', + 'only_matching': True, }] def _real_extract(self, url): From 9524dca3accc1e60cddd141f06924ed7404db9bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 Oct 2017 02:53:20 +0700 Subject: [PATCH 235/238] [README.md] Use revision bound link to YoutubeDL options (closes #14401) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7818e58df..2879aad24 100644 --- a/README.md +++ b/README.md @@ -1167,7 +1167,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: From c110944fa2f21af733b4f3168764e1b008e11514 Mon Sep 17 00:00:00 2001 From: "M.K" Date: Tue, 3 Oct 2017 22:50:27 +0200 Subject: [PATCH 236/238] [extractor/common] Fix typo in _parse_mpd_formats --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2bbbf8f4d..a878550c2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1920,7 +1920,7 @@ class InfoExtractor(object): # can't be used at the same time if '%(Number' in media_template and 's' not in representation_ms_info: segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration': + if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ From 6e736d86e7d03d14279c403202fe2a632e6e5023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 Oct 2017 04:27:42 +0700 Subject: [PATCH 237/238] [beeg] Fix extraction (closes #14403) --- youtube_dl/extractor/beeg.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index bbeae4bac..bf22a41b7 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -60,9 +60,13 @@ class BeegIE(InfoExtractor): beeg_version = beeg_version or '2185' beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' - video = self._download_json( - 'https://api.beeg.com/api/v6/%s/video/%s' % (beeg_version, video_id), - video_id) + for api_path in ('', 'api.'): + video = self._download_json( + 'https://%sbeeg.com/api/v6/%s/video/%s' + % (api_path, beeg_version, video_id), video_id, + fatal=api_path == 'api.') + if video: + break def split(o, e): def cut(s, x): From 6b46285e850f8bc8155a93adf13920752537e55d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 4 Oct 2017 07:45:13 +0200 Subject: [PATCH 238/238] [comedycentral] new shortcut :theopposition for "The Opposition" show --- youtube_dl/extractor/comedycentral.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 4cac29415..d08b909a6 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -120,13 +120,16 @@ class ComedyCentralTVIE(MTVServicesInfoExtractor): class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?Ptds|thedailyshow)$' + _VALID_URL = r'^:(?Ptds|thedailyshow|theopposition)$' _TESTS = [{ 'url': ':tds', 'only_matching': True, }, { 'url': ':thedailyshow', 'only_matching': True, + }, { + 'url': ':theopposition', + 'only_matching': True, }] def _real_extract(self, url): @@ -134,5 +137,6 @@ class ComedyCentralShortnameIE(InfoExtractor): shortcut_map = { 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', } return self.url_result(shortcut_map[video_id])