From 47e0cef46e9a76e589a2b1cde1b2dfa8bcf01d8e Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sun, 16 Apr 2017 00:34:34 +0200 Subject: [PATCH 001/337] [openload] rewrite extractor --- youtube_dl/extractor/openload.py | 110 +++++++++++++++++++------------ 1 file changed, 67 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d8036b54a..789bf997e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,12 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import os import re +import subprocess +import tempfile from .common import InfoExtractor -from ..compat import compat_chr from ..utils import ( + check_executable, determine_ext, + encodeArgument, ExtractorError, ) @@ -58,6 +62,39 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] + _PHANTOMJS_SCRIPT = r''' + phantom.onError = function(msg, trace) { + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) { + msgStack.push('TRACE:'); + trace.forEach(function(t) { + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }); + } + console.error(msgStack.join('\n')); + phantom.exit(1); + }; + var page = require('webpage').create(); + page.settings.resourceTimeout = 10000; + page.onInitialized = function() { + page.evaluate(function() { + delete window._phantom; + delete window.callPhantom; + }); + }; + page.open('https://openload.co/embed/%s/', function(status) { + var info = page.evaluate(function() { + return { + decoded_id: document.getElementById('streamurl').innerHTML, + title: document.querySelector('meta[name="og:title"],' + + 'meta[name=description]').content + }; + }); + console.log(info.decoded_id + ' ' + info.title); + phantom.exit(); + });''' + @staticmethod def _extract_urls(webpage): return re.findall( @@ -65,61 +102,48 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): + exe = check_executable('phantomjs', ['-v']) + if not exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + video_id = self._match_id(url) - webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) + url = 'https://openload.co/embed/%s/' % video_id + webpage = self._download_webpage(url, video_id) if 'File not found' in webpage or 'deleted by the owner' in webpage: - raise ExtractorError('File not found', expected=True) + raise ExtractorError('File not found', expected=True, video_id=video_id) - ol_id = self._search_regex( - ']+id="[^"]+"[^>]*>([0-9A-Za-z]+)', - webpage, 'openload ID') + script_file = tempfile.NamedTemporaryFile(mode='w', delete=False) - decoded = '' - a = ol_id[0:24] - b = [] - for i in range(0, len(a), 8): - b.append(int(a[i:i + 8] or '0', 16)) - ol_id = ol_id[24:] - j = 0 - k = 0 - while j < len(ol_id): - c = 128 - d = 0 - e = 0 - f = 0 - _more = True - while _more: - if j + 1 >= len(ol_id): - c = 143 - f = int(ol_id[j:j + 2] or '0', 16) - j += 2 - d += (f & 127) << e - e += 7 - _more = f >= c - g = d ^ b[k % 3] - for i in range(4): - char_dec = (g >> 8 * i) & (c + 127) - char = compat_chr(char_dec) - if char != '#': - decoded += char - k += 1 + # write JS script to file and close it + with script_file: + script_file.write(self._PHANTOMJS_SCRIPT % video_id) - video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % decoded + self.to_screen('%s: Decoding video ID with PhantomJS' % video_id) - title = self._og_search_title(webpage, default=None) or self._search_regex( - r']+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) + p = subprocess.Popen([exe, '--ssl-protocol=any', script_file.name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Decoding failed\n:' + + encodeArgument(err)) + else: + decoded_id, title = encodeArgument(output).strip().split(' ', 1) + + os.remove(script_file.name) + + video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id entries = self._parse_html5_media_entries(url, webpage, video_id) - subtitles = entries[0]['subtitles'] if entries else None + entry = entries[0] if entries else {} + subtitles = entry.get('subtitles') info_dict = { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), From da57ebaf84225240b356530cdf02d12596f0dce8 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 25 Apr 2017 01:06:14 +0200 Subject: [PATCH 002/337] [openload] separate PhantomJS code from extractor --- youtube_dl/extractor/openload.py | 78 ++++------------- youtube_dl/utils.py | 141 +++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 789bf997e..ac5e0bb08 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import os import re -import subprocess -import tempfile from .common import InfoExtractor from ..utils import ( - check_executable, determine_ext, - encodeArgument, ExtractorError, + get_element_by_id, + PhantomJSwrapper, ) @@ -62,38 +59,7 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] - _PHANTOMJS_SCRIPT = r''' - phantom.onError = function(msg, trace) { - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) { - msgStack.push('TRACE:'); - trace.forEach(function(t) { - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? ' (in function ' + t.function +')' : '')); - }); - } - console.error(msgStack.join('\n')); - phantom.exit(1); - }; - var page = require('webpage').create(); - page.settings.resourceTimeout = 10000; - page.onInitialized = function() { - page.evaluate(function() { - delete window._phantom; - delete window.callPhantom; - }); - }; - page.open('https://openload.co/embed/%s/', function(status) { - var info = page.evaluate(function() { - return { - decoded_id: document.getElementById('streamurl').innerHTML, - title: document.querySelector('meta[name="og:title"],' - + 'meta[name=description]').content - }; - }); - console.log(info.decoded_id + ' ' + info.title); - phantom.exit(); - });''' + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @staticmethod def _extract_urls(webpage): @@ -102,40 +68,27 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): - exe = check_executable('phantomjs', ['-v']) - if not exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) - video_id = self._match_id(url) url = 'https://openload.co/embed/%s/' % video_id - webpage = self._download_webpage(url, video_id) + headers = { + 'User-Agent': self._USER_AGENT, + } + + phantom = PhantomJSwrapper(self) + webpage, _ = phantom.get(url, video_id=video_id, headers=headers) if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) - script_file = tempfile.NamedTemporaryFile(mode='w', delete=False) - - # write JS script to file and close it - with script_file: - script_file.write(self._PHANTOMJS_SCRIPT % video_id) - - self.to_screen('%s: Decoding video ID with PhantomJS' % video_id) - - p = subprocess.Popen([exe, '--ssl-protocol=any', script_file.name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output, err = p.communicate() - if p.returncode != 0: - raise ExtractorError('Decoding failed\n:' - + encodeArgument(err)) - else: - decoded_id, title = encodeArgument(output).strip().split(' ', 1) - - os.remove(script_file.name) + decoded_id = get_element_by_id('streamurl', webpage) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + title = self._og_search_title(webpage, default=None) or self._search_regex( + r']+class=["\']title["\'][^>]*>([^<]+)', webpage, + 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', fatal=True) + entries = self._parse_html5_media_entries(url, webpage, video_id) entry = entries[0] if entries else {} subtitles = entry.get('subtitles') @@ -148,5 +101,6 @@ class OpenloadIE(InfoExtractor): # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), 'subtitles': subtitles, + 'http_headers': headers, } return info_dict diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2340bc306..94e1b07a6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3652,3 +3652,144 @@ def write_xattr(path, key, value): "Couldn't find a tool to set the xattrs. " "Install either the python 'xattr' module, " "or the 'xattr' binary.") + + +class PhantomJSwrapper(object): + """PhantomJS wrapper class""" + + _TEMPLATE = r''' + phantom.onError = function(msg, trace) {{ + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) {{ + msgStack.push('TRACE:'); + trace.forEach(function(t) {{ + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }}); + }} + console.error(msgStack.join('\n')); + phantom.exit(1); + }}; + var page = require('webpage').create(); + var fs = require('fs'); + var read = {{ mode: 'r', charset: 'utf-8' }}; + var write = {{ mode: 'w', charset: 'utf-8' }}; + page.settings.resourceTimeout = {timeout}; + page.settings.userAgent = "{ua}"; + page.onLoadStarted = function() {{ + page.evaluate(function() {{ + delete window._phantom; + delete window.callPhantom; + }}); + }}; + var saveAndExit = function() {{ + fs.write("{html}", page.content, write); + phantom.exit(); + }}; + page.onLoadFinished = function(status) {{ + if(page.url === "") {{ + page.setContent(fs.read("{html}", read), "{url}"); + }} + else {{ + {jscode} + }} + }}; + page.open(""); + ''' + + _TMP_FILE_NAMES = ['script', 'html'] + + def __init__(self, extractor, timeout=10000): + self.exe = check_executable('phantomjs', ['-v']) + if not self.exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + self.extractor = extractor + self.options = { + 'timeout': timeout, + } + self._TMP_FILES = {} + for name in self._TMP_FILE_NAMES: + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + self._TMP_FILES[name] = tmp + + def __del__(self): + for name in self._TMP_FILE_NAMES: + try: + os.remove(self._TMP_FILES[name].name) + except: + pass + + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): + """ + Downloads webpage (if needed) and executes JS + + Params: + url: website url + html: optional, html code of website + video_id: video id + note: optional, displayed when downloading webpage + note2: optional, displayed when executing JS + headers: custom http headers + jscode: code to be executed when page is loaded + + Returns tuple with: + * downloaded website (after JS execution) + * anything you print with `console.log` (but not inside `page.execute`!) + + In most cases you don't need to add any `jscode`. + It is executed in `page.onLoadFinished`. + `saveAndExit();` is mandatory, use it instead of `phantom.exit()` + It is possible to wait for some element on the webpage, for example: + var check = function() { + var elementFound = page.evaluate(function() { + return document.querySelector('#b.done') !== null; + }); + if(elementFound) + saveAndExit(); + else + window.setTimeout(check, 500); + } + + page.evaluate(function(){ + document.querySelector('#a').click(); + }); + check(); + """ + if 'saveAndExit();' not in jscode: + raise ExtractorError('`saveAndExit();` not found in `jscode`') + if not html: + html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) + with open(self._TMP_FILES['html'].name, 'wb') as f: + f.write(html.encode('utf-8')) + + replaces = self.options + replaces['url'] = url + user_agent = headers.get('User-Agent') or std_headers['User-Agent'] + replaces['ua'] = user_agent.replace('"', '\\"') + replaces['jscode'] = jscode + + for x in self._TMP_FILE_NAMES: + replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + + with open(self._TMP_FILES['script'].name, 'wb') as f: + f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) + + if video_id is None: + self.extractor.to_screen('%s' % (note2,)) + else: + self.extractor.to_screen('%s: %s' % (video_id, note2)) + + p = subprocess.Popen([self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Executing JS failed\n:' + + encodeArgument(err)) + with open(self._TMP_FILES['html'].name, 'rb') as f: + html = f.read().decode('utf-8') + return (html, encodeArgument(out)) + From 40e41780f1d770a355f01e3c1e6fb09ff392f97e Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 25 Apr 2017 15:12:54 +0200 Subject: [PATCH 003/337] [phantomjs] add cookie support --- youtube_dl/extractor/common.py | 8 +++-- youtube_dl/utils.py | 62 +++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dcc9d628a..e54adc9f0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2343,10 +2343,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, not port is None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 94e1b07a6..9c94b7ec9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3654,6 +3654,37 @@ def write_xattr(path, key, value): "or the 'xattr' binary.") +def cookie_to_dict(cookie): + cookie_dict = { + 'name': cookie.name, + 'value': cookie.value, + }; + if cookie.port_specified: + cookie_dict['port'] = cookie.port + if cookie.domain_specified: + cookie_dict['domain'] = cookie.domain + if cookie.path_specified: + cookie_dict['path'] = cookie.path + if not cookie.expires is None: + cookie_dict['expires'] = cookie.expires + if not cookie.secure is None: + cookie_dict['secure'] = cookie.secure + if not cookie.discard is None: + cookie_dict['discard'] = cookie.discard + try: + if (cookie.has_nonstandard_attr('httpOnly') or + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): + cookie_dict['httponly'] = True + except TypeError: + pass + return cookie_dict + + +def cookie_jar_to_list(cookie_jar): + return [cookie_to_dict(cookie) for cookie in cookie_jar] + + class PhantomJSwrapper(object): """PhantomJS wrapper class""" @@ -3674,6 +3705,9 @@ class PhantomJSwrapper(object): var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; var write = {{ mode: 'w', charset: 'utf-8' }}; + JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ + phantom.addCookie(x); + }}); page.settings.resourceTimeout = {timeout}; page.settings.userAgent = "{ua}"; page.onLoadStarted = function() {{ @@ -3684,6 +3718,7 @@ class PhantomJSwrapper(object): }}; var saveAndExit = function() {{ fs.write("{html}", page.content, write); + fs.write("{cookies}", JSON.stringify(phantom.cookies), write); phantom.exit(); }}; page.onLoadFinished = function(status) {{ @@ -3697,7 +3732,7 @@ class PhantomJSwrapper(object): page.open(""); ''' - _TMP_FILE_NAMES = ['script', 'html'] + _TMP_FILE_NAMES = ['script', 'html', 'cookies'] def __init__(self, extractor, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) @@ -3722,6 +3757,26 @@ class PhantomJSwrapper(object): except: pass + def _save_cookies(self, url): + cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) + for cookie in cookies: + if 'path' not in cookie: + cookie['path'] = '/' + if 'domain' not in cookie: + cookie['domain'] = compat_urlparse.urlparse(url).netloc + with open(self._TMP_FILES['cookies'].name, 'wb') as f: + f.write(json.dumps(cookies).encode('utf-8')) + + def _load_cookies(self): + with open(self._TMP_FILES['cookies'].name, 'rb') as f: + cookies = json.loads(f.read().decode('utf-8')) + for cookie in cookies: + if cookie['httponly'] is True: + cookie['rest'] = { 'httpOnly': None } + if 'expiry' in cookie: + cookie['expire_time'] = cookie['expiry'] + self.extractor._set_cookie(**cookie) + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ Downloads webpage (if needed) and executes JS @@ -3765,6 +3820,8 @@ class PhantomJSwrapper(object): with open(self._TMP_FILES['html'].name, 'wb') as f: f.write(html.encode('utf-8')) + self._save_cookies(url) + replaces = self.options replaces['url'] = url user_agent = headers.get('User-Agent') or std_headers['User-Agent'] @@ -3791,5 +3848,8 @@ class PhantomJSwrapper(object): + encodeArgument(err)) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') + + self._load_cookies() + return (html, encodeArgument(out)) From fcace2d1adac5d1f306b22219fde3a4542bcd719 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 10:30:45 +0200 Subject: [PATCH 004/337] [openload] raise `not found` before executing js --- youtube_dl/extractor/openload.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index ac5e0bb08..0adf17765 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -74,12 +74,14 @@ class OpenloadIE(InfoExtractor): 'User-Agent': self._USER_AGENT, } - phantom = PhantomJSwrapper(self) - webpage, _ = phantom.get(url, video_id=video_id, headers=headers) + webpage = self._download_webpage(url, video_id, headers=headers) if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) + phantom = PhantomJSwrapper(self) + webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) + decoded_id = get_element_by_id('streamurl', webpage) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id From 98f9d873814da2a8584cc30c0e197c15ed249db3 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 12:41:42 +0200 Subject: [PATCH 005/337] [phantomjs] Add required version checking --- youtube_dl/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9c94b7ec9..84aaac664 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3734,13 +3734,22 @@ class PhantomJSwrapper(object): _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - def __init__(self, extractor, timeout=10000): + def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: raise ExtractorError('PhantomJS executable not found in PATH, ' 'download it from http://phantomjs.org', expected=True) + self.extractor = extractor + + if required_version: + version = get_exe_version(self.exe, version_re=r'([0-9.]+)') + if is_outdated_version(version, required_version): + self.extractor._downloader.report_warning( + 'Your copy of PhantomJS is outdated, update it to version ' + '%s or newer if you encounter any errors.' % required_version) + self.options = { 'timeout': timeout, } From 7552f96352f35cd877e52fd0770b77ba1856fc62 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 12:41:57 +0200 Subject: [PATCH 006/337] [openload] Add required version --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 0adf17765..292476ef8 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -79,7 +79,7 @@ class OpenloadIE(InfoExtractor): if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) - phantom = PhantomJSwrapper(self) + phantom = PhantomJSwrapper(self, required_version='2.0') webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) decoded_id = get_element_by_id('streamurl', webpage) From 5ff1bc0cc10bf3006834c4b49fc36d733c83ce5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 28 Apr 2017 22:25:20 +0100 Subject: [PATCH 007/337] [YoutubeDL] write raw subtitle files --- youtube_dl/YoutubeDL.py | 43 +++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c425..c7100bb91 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1696,29 +1696,30 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - if sub_info.get('data') is not None: - sub_data = sub_info['data'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - try: - sub_data = ie._download_webpage( - sub_info['url'], info_dict['id'], note=False) - except ExtractorError as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err.cause))) - continue - try: - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_data) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return + try: + sub_data = ie._request_webpage( + sub_info['url'], info_dict['id'], note=False).read() + with io.open(encodeFilename(sub_filename), 'wb') as subfile: + subfile.write(sub_data) + except (ExtractorError, IOError, OSError, ValueError) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) From feee8d32e45c9521426cf4a089c70f37542f0065 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 3 Aug 2017 14:17:25 +0200 Subject: [PATCH 008/337] [phantomjs] add exe version to debug info --- youtube_dl/YoutubeDL.py | 2 ++ youtube_dl/utils.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c425..033b50702 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -86,6 +86,7 @@ from .utils import ( write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, + PhantomJSwrapper, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -2146,6 +2147,7 @@ class YoutubeDL(object): exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c67f95ac9..4d0685d83 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3895,6 +3895,10 @@ class PhantomJSwrapper(object): _TMP_FILE_NAMES = ['script', 'html', 'cookies'] + @staticmethod + def _version(): + return get_exe_version('phantomjs', version_re=r'([0-9.]+)') + def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: @@ -3905,7 +3909,7 @@ class PhantomJSwrapper(object): self.extractor = extractor if required_version: - version = get_exe_version(self.exe, version_re=r'([0-9.]+)') + version = self._version() if is_outdated_version(version, required_version): self.extractor._downloader.report_warning( 'Your copy of PhantomJS is outdated, update it to version ' From 8d81f3e36d3caaeabcdff99d3340d4075d30741e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Sep 2017 00:57:14 +0700 Subject: [PATCH 009/337] [youtube] Force old layout for each webpage (closes #14083) --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 563cf6274..953e38227 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _download_webpage(self, *args, **kwargs): + kwargs.setdefault('query', {})['disable_polymer'] = 'true' + return super(YoutubeBaseInfoExtractor, self)._download_webpage( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -2052,7 +2058,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ From 8681ed7fc80a6b4b99e3a57adf5ec8d4177a601c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 01:04:22 +0700 Subject: [PATCH 010/337] [ChangeLog] Actualize --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index ef9ac4660..d89fb3d18 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version <unreleased> + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + version 2017.08.27.1 Extractors From a2022b0c406286aa3d5101f1c85e7d11453e89d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 01:08:32 +0700 Subject: [PATCH 011/337] release 2017.09.02 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7959d910b..bd9e21983 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.27.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.27.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.02** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.08.27.1 +[debug] youtube-dl version 2017.09.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d89fb3d18..c439c8ef9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.02 Extractors * [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7504ef0f0..60ed35de9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.08.27.1' +__version__ = '2017.09.02' From a3431e12249530aa7a6962e54bcdbfce190c4c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 15:33:54 +0700 Subject: [PATCH 012/337] [radiocanada] Skip unsupported platforms (closes #14100) --- youtube_dl/extractor/radiocanada.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 3b40002a8..6bbc2781c 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -59,6 +59,7 @@ class RadioCanadaIE(InfoExtractor): device_types.append('android') formats = [] + error = None # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file for device_type in device_types: @@ -84,8 +85,8 @@ class RadioCanadaIE(InfoExtractor): if not v_url: continue if v_url == 'null': - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, xpath_text(v_data, 'message')), expected=True) + error = xpath_text(v_data, 'message') + continue ext = determine_ext(v_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -129,6 +130,9 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_f4m_formats( base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) + if not formats and error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) subtitles = {} From 64f0e30b93db804323a6db382d4ccdf1dac4e38b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 2 Sep 2017 15:44:49 +0700 Subject: [PATCH 013/337] [viidea] Capture and output lecture error message (#14099) --- youtube_dl/extractor/viidea.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 4adcd1830..a0abbae60 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,12 +4,14 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urlparse, + compat_HTTPError, compat_str, + compat_urlparse, ) from ..utils import ( - parse_duration, + ExtractorError, js_to_json, + parse_duration, parse_iso8601, ) @@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor): base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json( - '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), - lecture_id)['lecture'][0] + try: + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json( + e.cause.read().decode('utf-8'), lecture_id) + raise ExtractorError(msg['detail'], expected=True) + raise lecture_info = { 'id': lecture_id, From 503115540d8f135dc944ae48e40ba78f36238867 Mon Sep 17 00:00:00 2001 From: dubber0 <rexa.mose@gmail.com> Date: Sat, 22 Jul 2017 21:32:51 +0200 Subject: [PATCH 014/337] [aliexpress:live] Add extractor --- youtube_dl/extractor/aliexpress.py | 40 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 41 insertions(+) create mode 100644 youtube_dl/extractor/aliexpress.py diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py new file mode 100644 index 000000000..3997213f8 --- /dev/null +++ b/youtube_dl/extractor/aliexpress.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + + +import re + +from .common import InfoExtractor +from ..utils import try_get, float_or_none +from ..compat import compat_str + + +class AliExpressLiveIE(InfoExtractor): + + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>[0-9]{16})' + _TEST = { + 'url': 'https://live.aliexpress.com/live/2800002704436634', + 'md5': '7ac2bc46afdd18f0b45a0a340fc47ffe', + 'info_dict': { + 'id': '2800002704436634', + 'ext': 'm3u8', + 'title': 'CASIMA7.22', + 'uploader': 'CASIMA Official Store', + 'upload_date': '20170714', + 'timestamp': 1500027138, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + page = self._download_webpage(url, video_id) + run_params_json = self._search_regex(r'runParams = (.+)[\s+]var myCtl', page, 'runParams', flags=re.DOTALL) + run_params = self._parse_json(run_params_json, video_id) + + return { + 'id': video_id, + 'title': run_params['title'], + 'url': run_params['replyStreamUrl'], + 'uploader': try_get(run_params, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(try_get(run_params, lambda x: x['followBar']['createTime']) / 1000), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 17048fd6e..d335f9fff 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -45,6 +45,7 @@ from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( From 23b2df82c70a832e485aaf52befa26e27a904995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 16:04:36 +0700 Subject: [PATCH 015/337] [aliexpress:live] Fix issues (closes #13698, closes #13707) --- youtube_dl/extractor/aliexpress.py | 47 +++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py index 3997213f8..6f241e683 100644 --- a/youtube_dl/extractor/aliexpress.py +++ b/youtube_dl/extractor/aliexpress.py @@ -1,40 +1,53 @@ # coding: utf-8 from __future__ import unicode_literals - -import re - from .common import InfoExtractor -from ..utils import try_get, float_or_none from ..compat import compat_str +from ..utils import ( + float_or_none, + try_get, +) class AliExpressLiveIE(InfoExtractor): - - _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>[0-9]{16})' + _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)' _TEST = { 'url': 'https://live.aliexpress.com/live/2800002704436634', - 'md5': '7ac2bc46afdd18f0b45a0a340fc47ffe', + 'md5': 'e729e25d47c5e557f2630eaf99b740a5', 'info_dict': { 'id': '2800002704436634', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'CASIMA7.22', + 'thumbnail': r're:http://.*\.jpg', 'uploader': 'CASIMA Official Store', - 'upload_date': '20170714', - 'timestamp': 1500027138, + 'timestamp': 1500717600, + 'upload_date': '20170722', }, } def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) - run_params_json = self._search_regex(r'runParams = (.+)[\s+]var myCtl', page, 'runParams', flags=re.DOTALL) - run_params = self._parse_json(run_params_json, video_id) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', + webpage, 'runParams'), + video_id) + + title = data['title'] + + formats = self._extract_m3u8_formats( + data['replyStreamUrl'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') return { 'id': video_id, - 'title': run_params['title'], - 'url': run_params['replyStreamUrl'], - 'uploader': try_get(run_params, lambda x: x['followBar']['name'], compat_str), - 'timestamp': float_or_none(try_get(run_params, lambda x: x['followBar']['createTime']) / 1000), + 'title': title, + 'thumbnail': data.get('coverUrl'), + 'uploader': try_get( + data, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), + 'formats': formats, } From 73602bcd0c254b735cc93ce5ffeca9e98228190e Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi <ishitatsuyuki@gmail.com> Date: Fri, 1 Sep 2017 17:08:24 +0900 Subject: [PATCH 016/337] [soundcloud] Fix download URL with private tracks --- youtube_dl/extractor/soundcloud.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2e52e092b..23dcac803 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,8 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re from .common import ( InfoExtractor, @@ -17,7 +17,7 @@ from ..utils import ( ExtractorError, int_or_none, unified_strdate, -) + update_url_query) class SoundcloudIE(InfoExtractor): @@ -160,11 +160,13 @@ class SoundcloudIE(InfoExtractor): 'license': info.get('license'), } formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token is not None: + query['secret_token'] = secret_token if info.get('downloadable', False): # We can build a direct link to the song - format_url = ( - 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format( - track_id, self._CLIENT_ID)) + format_url = update_url_query( + 'https://api.soundcloud.com/tracks/{0}/download'.format(track_id), query) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), @@ -176,10 +178,7 @@ class SoundcloudIE(InfoExtractor): # We have to retrieve the url format_dict = self._download_json( 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id, - track_id, 'Downloading track url', query={ - 'client_id': self._CLIENT_ID, - 'secret_token': secret_token, - }) + track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): abr = int_or_none(self._search_regex( @@ -216,7 +215,7 @@ class SoundcloudIE(InfoExtractor): # cannot be always used, sometimes it can give an HTTP 404 error formats.append({ 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'url': update_url_query(info['stream_url'], query), 'ext': ext, }) From d7c7100e3d920512a11bf7c6fee21e26da7ffa73 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Sun, 3 Sep 2017 16:18:24 +0700 Subject: [PATCH 017/337] [soundcloud] Simplify and add test (closes #14093) --- youtube_dl/extractor/soundcloud.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 23dcac803..1c6799d57 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -17,7 +17,8 @@ from ..utils import ( ExtractorError, int_or_none, unified_strdate, - update_url_query) + update_url_query, +) class SoundcloudIE(InfoExtractor): @@ -120,6 +121,21 @@ class SoundcloudIE(InfoExtractor): 'license': 'cc-by-sa', }, }, + # private link, downloadable format + { + 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd', + 'md5': '64a60b16e617d41d0bef032b7f55441e', + 'info_dict': { + 'id': '340344461', + 'ext': 'wav', + 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]', + 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366', + 'uploader': 'Ori Uplift Music', + 'upload_date': '20170831', + 'duration': 7449, + 'license': 'all-rights-reserved', + }, + }, ] _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg' @@ -166,7 +182,7 @@ class SoundcloudIE(InfoExtractor): if info.get('downloadable', False): # We can build a direct link to the song format_url = update_url_query( - 'https://api.soundcloud.com/tracks/{0}/download'.format(track_id), query) + 'https://api.soundcloud.com/tracks/%s/download' % track_id, query) formats.append({ 'format_id': 'download', 'ext': info.get('original_format', 'mp3'), From 0cbb841ba94c8d813ff81e817154c5491a796f20 Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Thu, 31 Aug 2017 12:56:37 +0200 Subject: [PATCH 018/337] [bpb] Fix extraction (closes #14043) --- youtube_dl/extractor/bpb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 9661ade4f..14bc0f77d 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,13 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - quality = video_info['quality'] video_url = video_info['src'] + quality = 'high' if re.search(r'_high\.', video_url) else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, From c1c1585b316995ca47b59e8dc1e3b463beb1c54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 16:38:43 +0700 Subject: [PATCH 019/337] [bpb] Improve (closes #14086) --- youtube_dl/extractor/bpb.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 14bc0f77d..07833532e 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,18 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src\s*:\s*'https://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: - video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - video_url = video_info['src'] - quality = 'high' if re.search(r'_high\.', video_url) else 'low' + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, From 0b4a8eb3ac823c26b037eb368c114ce6d976c5c3 Mon Sep 17 00:00:00 2001 From: theychx <boomingzooming@gmail.com> Date: Mon, 28 Aug 2017 21:35:57 +0200 Subject: [PATCH 020/337] [vidme:user] Relax _VALID_URLs --- youtube_dl/extractor/vidme.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index a7971d72e..39b65ed2f 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -263,29 +263,43 @@ class VidmeListBaseIE(InfoExtractor): class VidmeUserIE(VidmeListBaseIE): IE_NAME = 'vidme:user' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)' _API_ITEM = 'list' _TITLE = 'Videos' - _TEST = { - 'url': 'https://vid.me/EFARCHIVE', + _TESTS = [{ + 'url': 'https://vid.me/MasakoX', 'info_dict': { - 'id': '3834632', - 'title': 'EFARCHIVE - %s' % _TITLE, + 'id': '16112341', + 'title': 'MasakoX - %s' % _TITLE, }, - 'playlist_mincount': 238, - } + 'playlist_mincount': 191, + }, { + 'url': 'https://vid.me/unsQuare_netWork', + 'info_dict': { + 'id': '16148757', + 'title': 'unsQuare_netWork - %s' % _TITLE, + }, + 'playlist_mincount': 73, + }] class VidmeUserLikesIE(VidmeListBaseIE): IE_NAME = 'vidme:user:likes' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes' + _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes' _API_ITEM = 'likes' _TITLE = 'Likes' - _TEST = { + _TESTS = [{ 'url': 'https://vid.me/ErinAlexis/likes', 'info_dict': { 'id': '6483530', 'title': 'ErinAlexis - %s' % _TITLE, }, 'playlist_mincount': 415, - } + }, { + 'url': 'https://vid.me/Kaleidoscope-Ish/likes', + 'info_dict': { + 'id': '16908594', + 'title': 'Kaleidoscope-Ish - %s' % _TITLE, + }, + 'playlist_mincount': 43, + }] From bc35f075370ed1e67fe71c544e6243a2fc4fa430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 17:02:11 +0700 Subject: [PATCH 021/337] [vidme:user] Make tests only matching (closes #14054) --- youtube_dl/extractor/vidme.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 39b65ed2f..59adb2377 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -275,11 +275,7 @@ class VidmeUserIE(VidmeListBaseIE): 'playlist_mincount': 191, }, { 'url': 'https://vid.me/unsQuare_netWork', - 'info_dict': { - 'id': '16148757', - 'title': 'unsQuare_netWork - %s' % _TITLE, - }, - 'playlist_mincount': 73, + 'only_matching': True, }] @@ -297,9 +293,5 @@ class VidmeUserLikesIE(VidmeListBaseIE): 'playlist_mincount': 415, }, { 'url': 'https://vid.me/Kaleidoscope-Ish/likes', - 'info_dict': { - 'id': '16908594', - 'title': 'Kaleidoscope-Ish - %s' % _TITLE, - }, - 'playlist_mincount': 43, + 'only_matching': True, }] From e9b865267aaa90e3b9e1b0468d20a4df31e13393 Mon Sep 17 00:00:00 2001 From: John D <jdong1992@gmail.com> Date: Wed, 30 Aug 2017 00:14:43 -0700 Subject: [PATCH 022/337] [manyvids] Add support for preview videos (closes #14053) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/manyvids.py | 36 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/manyvids.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d335f9fff..46a11f3ef 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -564,6 +564,7 @@ from .mangomolo import ( MangomoloVideoIE, MangomoloLiveIE, ) +from .manyvids import ManyVidsIE from .matchtv import MatchTVIE from .mdr import MDRIE from .mediaset import MediasetIE diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py new file mode 100644 index 000000000..ea739ce3f --- /dev/null +++ b/youtube_dl/extractor/manyvids.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class ManyVidsIE(InfoExtractor): + _VALID_URL = r'https?://www.manyvids\.com/Video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', + 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', + 'info_dict': { + 'id': '133957', + 'ext': 'mp4', + 'title': 'everthing about me', + + } + } + + def _real_extract(self, url): + formats = [] + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_url = compat_urllib_parse_unquote(self._search_regex( + r'data-video-filepath=\"(.+?)\"', webpage, 'video URL', default='')) + + title = self._html_search_regex(r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + formats.append({ + 'url': video_url + }) + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } From efc57145c10bdf22da9d8571c35ccd0404e3b7c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 3 Sep 2017 17:30:02 +0700 Subject: [PATCH 023/337] [manyvids] Improve (closes #14059) --- youtube_dl/extractor/manyvids.py | 40 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index ea739ce3f..b94b3c2ab 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -2,35 +2,47 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..utils import int_or_none class ManyVidsIE(InfoExtractor): - _VALID_URL = r'https?://www.manyvids\.com/Video/(?P<id>[0-9]+)' + _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)' _TEST = { 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', 'info_dict': { 'id': '133957', 'ext': 'mp4', - 'title': 'everthing about me', - - } + 'title': 'everthing about me (Preview)', + 'view_count': int, + 'like_count': int, + }, } def _real_extract(self, url): - formats = [] video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_url = compat_urllib_parse_unquote(self._search_regex( - r'data-video-filepath=\"(.+?)\"', webpage, 'video URL', default='')) - title = self._html_search_regex(r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') - formats.append({ - 'url': video_url - }) + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url') + + title = '%s (Preview)' % self._html_search_regex( + r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + + like_count = int_or_none(self._search_regex( + r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) + view_count = int_or_none(self._html_search_regex( + r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, + 'view count', default=None)) + return { 'id': video_id, 'title': title, - 'formats': formats, + 'view_count': view_count, + 'like_count': like_count, + 'formats': [{ + 'url': video_url, + }], } From 6348671c4a4e9f45cebc107ff4c148ef4970bb39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 4 Sep 2017 23:08:07 +0700 Subject: [PATCH 024/337] [arte] Relax unavailability check (closes #14112) --- youtube_dl/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 02613cf5d..5cde90c5b 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -82,7 +82,7 @@ class ArteTVBaseIE(InfoExtractor): vsr = player_info['VSR'] - if not vsr and not player_info.get('VRU'): + if not vsr: raise ExtractorError( 'Video %s is not available' % player_info.get('VID') or video_id, expected=True) From 880fa66f4ffa9afcfce91b5ce39f05909050da67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Sep 2017 22:45:07 +0700 Subject: [PATCH 025/337] [redtube] Fix formats extraction (closes #14122) --- youtube_dl/extractor/redtube.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index c367a6ae7..f70a75256 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -62,7 +63,23 @@ class RedTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) - else: + medias = self._parse_json( + self._search_regex( + r'mediaDefinition\s*:\s*(\[.+?\])', webpage, + 'media definitions', default='{}'), + video_id, fatal=False) + if medias and isinstance(medias, list): + for media in medias: + format_url = media.get('videoUrl') + if not format_url or not isinstance(format_url, compat_str): + continue + format_id = media.get('quality') + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + if not formats: video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') formats.append({'url': video_url}) @@ -73,7 +90,7 @@ class RedTubeIE(InfoExtractor): r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', webpage, 'upload date', fatal=False)) duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', webpage, 'view count', fatal=False)) From c5c9bf0c120d2c481124a0c3913b981cf061fb95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 5 Sep 2017 23:31:34 +0700 Subject: [PATCH 026/337] [YoutubeDL] Ensure dir existence for each requested format (closes #14116) --- youtube_dl/YoutubeDL.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5f4c93ea3..4f208f1e1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1710,12 +1710,17 @@ class YoutubeDL(object): if filename is None: return - try: - dn = os.path.dirname(sanitize_path(encodeFilename(filename))) - if dn and not os.path.exists(dn): - os.makedirs(dn) - except (OSError, IOError) as err: - self.report_error('unable to create directory ' + error_to_compat_str(err)) + def ensure_dir_exists(path): + try: + dn = os.path.dirname(path) + if dn and not os.path.exists(dn): + os.makedirs(dn) + return True + except (OSError, IOError) as err: + self.report_error('unable to create directory ' + error_to_compat_str(err)) + return False + + if not ensure_dir_exists(sanitize_path(encodeFilename(filename))): return if self.params.get('writedescription', False): @@ -1853,8 +1858,11 @@ class YoutubeDL(object): for f in requested_formats: new_info = dict(info_dict) new_info.update(f) - fname = self.prepare_filename(new_info) - fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext']) + fname = prepend_extension( + self.prepare_filename(new_info), + 'f%s' % f['format_id'], new_info['ext']) + if not ensure_dir_exists(fname): + return downloaded.append(fname) partial_success = dl(fname, new_info) success = success and partial_success From 66c9fa36c10860b380806b9de48f38d628289e03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 6 Sep 2017 00:48:37 +0700 Subject: [PATCH 027/337] [youtube] Separate methods for embeds extraction --- youtube_dl/extractor/generic.py | 33 ++++------------------------- youtube_dl/extractor/youtube.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c81efdc00..b83c18380 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2243,36 +2243,11 @@ class GenericIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') - # Look for embedded YouTube player - matches = re.findall(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) - \1''', webpage) - if matches: + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: return self.playlist_from_matches( - matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - - # Look for lazyYT YouTube embed - matches = re.findall( - r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - - # Look for Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) matches = DailymotionIE._extract_urls(webpage) if matches: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 953e38227..ad2e933ee 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1374,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + <iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/.+?) + \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) From 5113b6912467619bd463c5ebefe759d07078bea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 6 Sep 2017 00:50:25 +0700 Subject: [PATCH 028/337] [abcnews,chilloutsoze,cracked,vice,vk] Use dedicated YouTube embeds extraction routines --- youtube_dl/extractor/abcnews.py | 7 +++---- youtube_dl/extractor/chilloutzone.py | 9 ++++----- youtube_dl/extractor/cracked.py | 7 +++---- youtube_dl/extractor/vice.py | 7 +++---- youtube_dl/extractor/vk.py | 7 +++---- 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 74d54560c..f770fe901 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -7,6 +7,7 @@ import time from .amp import AMPIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_urlparse @@ -108,9 +109,7 @@ class AbcNewsIE(InfoExtractor): r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') full_video_url = compat_urlparse.urljoin(url, video_url) - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(webpage) timestamp = None date_str = self._html_search_regex( @@ -140,7 +139,7 @@ class AbcNewsIE(InfoExtractor): } if youtube_url: - entries = [entry, self.url_result(youtube_url, 'Youtube')] + entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] return self.playlist_result(entries) return entry diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 0206d96db..d4769da75 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -5,6 +5,7 @@ import base64 import json from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( clean_html, ExtractorError @@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor): # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) if native_platform is None: - youtube_url = self._html_search_regex( - r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - webpage, 'fallback video URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, ie='Youtube') + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or # the own CDN diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 94d03ce2a..f77a68ece 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( parse_iso8601, str_to_int, @@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - youtube_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'youtube url', default=None) + youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) video_url = self._html_search_regex( [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 54e207b39..b8b8bf979 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -7,6 +7,7 @@ import hashlib import json from .adobepass import AdobePassIE +from .youtube import YoutubeIE from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -261,11 +262,9 @@ class ViceArticleIE(InfoExtractor): if embed_code: return _url_res('ooyala:%s' % embed_code, 'Ooyala') - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="(.*youtube\.com/.*)"', - body, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(body) if youtube_url: - return _url_res(youtube_url, 'Youtube') + return _url_res(youtube_url, YoutubeIE.ie_key()) video_url = self._html_search_regex( r'data-video-url="([^"]+)"', diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index dc2719cf9..105e172d5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -25,6 +25,7 @@ from ..utils import ( from .dailymotion import DailymotionIE from .pladform import PladformIE from .vimeo import VimeoIE +from .youtube import YoutubeIE class VKBaseIE(InfoExtractor): @@ -345,11 +346,9 @@ class VKIE(VKBaseIE): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - youtube_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', - info_page, 'youtube iframe', default=None) + youtube_url = YoutubeIE._extract_url(info_page) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) vimeo_url = VimeoIE._extract_url(url, info_page) if vimeo_url is not None: From 931edb2ada89db2bf3596ce1ad5c4d808914c7ab Mon Sep 17 00:00:00 2001 From: Olivier Bilodeau <olivier@bottomlesspit.org> Date: Fri, 8 Sep 2017 10:53:24 -0400 Subject: [PATCH 029/337] [radiocanada] Add fallback for title extraction --- youtube_dl/extractor/radiocanada.py | 45 ++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 6bbc2781c..b952e59b4 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -20,20 +20,37 @@ from ..utils import ( class RadioCanadaIE(InfoExtractor): IE_NAME = 'radiocanada' _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)' - _TEST = { - 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', - 'info_dict': { - 'id': '7184272', - 'ext': 'mp4', - 'title': 'Le parcours du tireur capté sur vidéo', - 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', - 'upload_date': '20141023', + _TESTS = [ + { + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', + 'info_dict': { + 'id': '7184272', + 'ext': 'mp4', + 'title': 'Le parcours du tireur capté sur vidéo', + 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', + 'upload_date': '20141023', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } + { + # empty Title + 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/', + 'info_dict': { + 'id': '7754998', + 'ext': 'mp4', + 'title': 'letelejournal22h', + 'description': 'INTEGRALE WEB 22H-TJ', + 'upload_date': '20170720', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + ] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -145,7 +162,7 @@ class RadioCanadaIE(InfoExtractor): return { 'id': video_id, - 'title': get_meta('Title'), + 'title': get_meta('Title') or get_meta('AV-nomEmission'), 'description': get_meta('Description') or get_meta('ShortDescription'), 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'), 'duration': int_or_none(get_meta('length')), From 51aee72d16eb844377a44c12e50dbb95cd4ced27 Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Fri, 8 Sep 2017 15:13:17 +0000 Subject: [PATCH 030/337] [README.md] Clarify how to run extractor specific test cases --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f5d00df3..28ee63f40 100644 --- a/README.md +++ b/README.md @@ -936,6 +936,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -1003,7 +1005,7 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: From debed8d759e74507371758d2344ce5afe5e237c2 Mon Sep 17 00:00:00 2001 From: luceatnobis <wehrmeyer.martin@web.de> Date: Tue, 4 Jul 2017 11:26:02 +0200 Subject: [PATCH 031/337] [rutube:playlist] Add extractor (closes #13534) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rutube.py | 84 ++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 46a11f3ef..aefadc56f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -899,6 +899,7 @@ from .rutube import ( RutubeEmbedIE, RutubeMovieIE, RutubePersonIE, + RutubePlaylistIE, ) from .rutv import RUTVIE from .ruutu import RuutuIE diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 889fa7628..a6b17c0ef 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -7,10 +7,14 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, unified_strdate, + try_get, + int_or_none, ) @@ -42,8 +46,24 @@ class RutubeIE(InfoExtractor): }, { 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, + }, { + 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + parts = compat_urllib_parse_urlparse(url) + params = compat_parse_qs(parts.query) + + # see if URL without parameters is OK + res = super(RutubeIE, cls).suitable(url) + + if params: # we only allow pl_id parameter in the url + res = res and 'pl_id' in params and len(params) == 1 + + return res + @staticmethod def _extract_urls(webpage): return [mobj.group('url') for mobj in re.finditer( @@ -193,3 +213,67 @@ class RutubePersonIE(RutubeChannelIE): }] _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' + + +class RutubePlaylistIE(InfoExtractor): + IE_NAME = 'rutube:playlist' + IE_DESC = 'Rutube playlists' + _TESTS = [{ + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'info_dict': { + 'id': '4252', + }, + 'playlist_count': 25, + }] + + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?(?:.+)?pl_id=(?P<id>\d+)' + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/source/%s/?page=%s' + + @staticmethod + def suitable(url): + params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + return params.get('pl_id') and int_or_none(params['pl_id'][0]) \ + and params.get('pl_type') + + def _real_extract(self, url): + playlist_id = self._match_id(url) + return self._extract_playlist(playlist_id) + + def _extract_playlist(self, playlist_id): + entries = [] + for pagenum in itertools.count(1): + page_url = self._PAGE_TEMPLATE % (playlist_id, pagenum) + + # download_json will sent an accept: application/xml header + page = self._download_json(page_url, playlist_id, + "Downloading metadata for page %s" % pagenum, + headers={'Accept': 'application/json'}) + + if not page['results']: + break + + results = page['results'] + for result in results: + entry = self.url_result(result.get('video_url'), 'Rutube') + category = try_get(result, lambda x: x['category']['name']) + entry.update({ + 'id': result.get('id'), + 'uploader': try_get(result, lambda x: x['author']['name']), + 'uploader_id': try_get(result, lambda x: x['author']['id']), + 'upload_date': unified_strdate(result.get('created_ts')), + 'title': result.get('title'), + 'description': result.get('description'), + 'thumbnail': result.get('thumbnail_url'), + 'duration': int_or_none(result.get('duration')), + 'category': [category] if category else None, + 'age_limit': 18 if result.get('is_adult') else 0, + 'view_count': int_or_none(result.get('hits')), + 'is_live': result.get('is_livestream'), + 'webpage_url': result.get('video_url'), + }) + entries.append(entry) + + if page['has_next'] is False: + break + + return self.playlist_result(entries, playlist_id, page['name']) From 48b813748d91acc7e9efc15075079a03faea18ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 18:39:13 +0700 Subject: [PATCH 032/337] [rutube] Rework and generalize playlist extractors (closes #13565) --- youtube_dl/extractor/rutube.py | 216 ++++++++++++++++----------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index a6b17c0ef..5a184879f 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -12,34 +12,60 @@ from ..compat import ( ) from ..utils import ( determine_ext, - unified_strdate, + unified_timestamp, try_get, int_or_none, ) -class RutubeIE(InfoExtractor): +class RutubeBaseIE(InfoExtractor): + def _extract_video(self, video, video_id=None, require_title=True): + title = video['title'] if require_title else video.get('title') + + age_limit = video.get('is_adult') + if age_limit is not None: + age_limit = 18 if age_limit is True else 0 + + uploader_id = try_get(video, lambda x: x['author']['id']) + category = try_get(video, lambda x: x['category']['name']) + + return { + 'id': video.get('id') or video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnail_url'), + 'duration': int_or_none(video.get('duration')), + 'uploader': try_get(video, lambda x: x['author']['name']), + 'uploader_id': compat_str(uploader_id) if uploader_id else None, + 'timestamp': unified_timestamp(video.get('created_ts')), + 'category': [category] if category else None, + 'age_limit': age_limit, + 'view_count': int_or_none(video.get('hits')), + 'comment_count': int_or_none(video.get('comments_count')), + 'is_live': video.get('is_livestream'), + } + + +class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', + 'md5': '79938ade01294ef7e27574890d0d3769', 'info_dict': { 'id': '3eac3b4561676c17df9132a9a1e62e3e', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', 'duration': 80, 'uploader': 'NTDRussian', 'uploader_id': '29790', + 'timestamp': 1381943602, 'upload_date': '20131016', 'age_limit': 0, }, - 'params': { - # It requires ffmpeg (m3u8 download) - 'skip_download': True, - }, }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, @@ -49,20 +75,14 @@ class RutubeIE(InfoExtractor): }, { 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252', 'only_matching': True, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source', + 'only_matching': True, }] @classmethod def suitable(cls, url): - parts = compat_urllib_parse_urlparse(url) - params = compat_parse_qs(parts.query) - - # see if URL without parameters is OK - res = super(RutubeIE, cls).suitable(url) - - if params: # we only allow pl_id parameter in the url - res = res and 'pl_id' in params and len(params) == 1 - - return res + return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url) @staticmethod def _extract_urls(webpage): @@ -72,12 +92,12 @@ class RutubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video = self._download_json( 'http://rutube.ru/api/video/%s/?format=json' % video_id, video_id, 'Downloading video JSON') - # Some videos don't have the author field - author = video.get('author') or {} + info = self._extract_video(video, video_id) options = self._download_json( 'http://rutube.ru/api/play/options/%s/?format=json' % video_id, @@ -99,19 +119,8 @@ class RutubeIE(InfoExtractor): }) self._sort_formats(formats) - return { - 'id': video['id'], - 'title': video['title'], - 'description': video['description'], - 'duration': video['duration'], - 'view_count': video['hits'], - 'formats': formats, - 'thumbnail': video['thumbnail_url'], - 'uploader': author.get('name'), - 'uploader_id': compat_str(author['id']) if author else None, - 'upload_date': unified_strdate(video['created_ts']), - 'age_limit': 18 if video['is_adult'] else 0, - } + info['formats'] = formats + return info class RutubeEmbedIE(InfoExtractor): @@ -123,7 +132,8 @@ class RutubeEmbedIE(InfoExtractor): 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', 'info_dict': { 'id': 'a10e53b86e8f349080f718582ce4c661', - 'ext': 'mp4', + 'ext': 'flv', + 'timestamp': 1387830582, 'upload_date': '20131223', 'uploader_id': '297833', 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', @@ -131,7 +141,7 @@ class RutubeEmbedIE(InfoExtractor): 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', }, 'params': { - 'skip_download': 'Requires ffmpeg', + 'skip_download': True, }, }, { 'url': 'http://rutube.ru/play/embed/8083783', @@ -145,10 +155,51 @@ class RutubeEmbedIE(InfoExtractor): canonical_url = self._html_search_regex( r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, 'Canonical URL') - return self.url_result(canonical_url, 'Rutube') + return self.url_result(canonical_url, RutubeIE.ie_key()) -class RutubeChannelIE(InfoExtractor): +class RutubePlaylistBaseIE(RutubeBaseIE): + def _next_page_url(self, page_num, playlist_id, *args, **kwargs): + return self._PAGE_TEMPLATE % (playlist_id, page_num) + + def _entries(self, playlist_id, *args, **kwargs): + next_page_url = None + for pagenum in itertools.count(1): + page = self._download_json( + next_page_url or self._next_page_url( + pagenum, playlist_id, *args, **kwargs), + playlist_id, 'Downloading page %s' % pagenum) + + results = page.get('results') + if not results or not isinstance(results, list): + break + + for result in results: + video_url = result.get('video_url') + if not video_url or not isinstance(video_url, compat_str): + continue + entry = self._extract_video(result, require_title=False) + entry.update({ + '_type': 'url', + 'url': video_url, + 'ie_key': RutubeIE.ie_key(), + }) + yield entry + + next_page_url = page.get('next') + if not next_page_url or not page.get('has_next'): + break + + def _extract_playlist(self, playlist_id, *args, **kwargs): + return self.playlist_result( + self._entries(playlist_id, *args, **kwargs), + playlist_id, kwargs.get('playlist_name')) + + def _real_extract(self, url): + return self._extract_playlist(self._match_id(url)) + + +class RutubeChannelIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' @@ -162,27 +213,8 @@ class RutubeChannelIE(InfoExtractor): _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json' - def _extract_videos(self, channel_id, channel_title=None): - entries = [] - for pagenum in itertools.count(1): - page = self._download_json( - self._PAGE_TEMPLATE % (channel_id, pagenum), - channel_id, 'Downloading page %s' % pagenum) - results = page['results'] - if not results: - break - entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results) - if not page['has_next']: - break - return self.playlist_result(entries, channel_id, channel_title) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - return self._extract_videos(channel_id) - - -class RutubeMovieIE(RutubeChannelIE): +class RutubeMovieIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' @@ -196,11 +228,11 @@ class RutubeMovieIE(RutubeChannelIE): movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') - movie_name = movie['name'] - return self._extract_videos(movie_id, movie_name) + return self._extract_playlist( + movie_id, playlist_name=movie.get('name')) -class RutubePersonIE(RutubeChannelIE): +class RutubePersonIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' @@ -215,65 +247,33 @@ class RutubePersonIE(RutubeChannelIE): _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json' -class RutubePlaylistIE(InfoExtractor): +class RutubePlaylistIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:playlist' IE_DESC = 'Rutube playlists' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)' _TESTS = [{ - 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag', 'info_dict': { - 'id': '4252', + 'id': '3097', }, - 'playlist_count': 25, + 'playlist_count': 27, + }, { + 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source', + 'only_matching': True, }] - _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?(?:.+)?pl_id=(?P<id>\d+)' - _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/source/%s/?page=%s' + _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' @staticmethod def suitable(url): params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - return params.get('pl_id') and int_or_none(params['pl_id'][0]) \ - and params.get('pl_type') + return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) + + def _next_page_url(self, page_num, playlist_id, item_kind): + return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num) def _real_extract(self, url): - playlist_id = self._match_id(url) - return self._extract_playlist(playlist_id) - - def _extract_playlist(self, playlist_id): - entries = [] - for pagenum in itertools.count(1): - page_url = self._PAGE_TEMPLATE % (playlist_id, pagenum) - - # download_json will sent an accept: application/xml header - page = self._download_json(page_url, playlist_id, - "Downloading metadata for page %s" % pagenum, - headers={'Accept': 'application/json'}) - - if not page['results']: - break - - results = page['results'] - for result in results: - entry = self.url_result(result.get('video_url'), 'Rutube') - category = try_get(result, lambda x: x['category']['name']) - entry.update({ - 'id': result.get('id'), - 'uploader': try_get(result, lambda x: x['author']['name']), - 'uploader_id': try_get(result, lambda x: x['author']['id']), - 'upload_date': unified_strdate(result.get('created_ts')), - 'title': result.get('title'), - 'description': result.get('description'), - 'thumbnail': result.get('thumbnail_url'), - 'duration': int_or_none(result.get('duration')), - 'category': [category] if category else None, - 'age_limit': 18 if result.get('is_adult') else 0, - 'view_count': int_or_none(result.get('hits')), - 'is_live': result.get('is_livestream'), - 'webpage_url': result.get('video_url'), - }) - entries.append(entry) - - if page['has_next'] is False: - break - - return self.playlist_result(entries, playlist_id, page['name']) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + playlist_kind = qs['pl_type'][0] + playlist_id = qs['pl_id'][0] + return self._extract_playlist(playlist_id, item_kind=playlist_kind) From c7e327c4d46a9b72f3f707710194dccf6eee50d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 19:08:39 +0700 Subject: [PATCH 033/337] [utils] Introduce bool_or_none --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2554a2abd..c42dd4c3a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1815,6 +1815,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + def strip_or_none(v): return None if v is None else v.strip() From c3dd44e08577c2ae0d08951037db5d1db7a321c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 19:09:27 +0700 Subject: [PATCH 034/337] [rutube] Use bool_or_none --- youtube_dl/extractor/rutube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 5a184879f..828c03b48 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -12,9 +12,10 @@ from ..compat import ( ) from ..utils import ( determine_ext, - unified_timestamp, - try_get, + bool_or_none, int_or_none, + try_get, + unified_timestamp, ) @@ -42,7 +43,7 @@ class RutubeBaseIE(InfoExtractor): 'age_limit': age_limit, 'view_count': int_or_none(video.get('hits')), 'comment_count': int_or_none(video.get('comments_count')), - 'is_live': video.get('is_livestream'), + 'is_live': bool_or_none(video.get('is_livestream')), } From bf6ec2fea9087235c14df2a079620fcc2c17b5eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 22:08:32 +0700 Subject: [PATCH 035/337] [fox] Fix extraction (#14147) --- youtube_dl/extractor/fox.py | 121 ++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 159fdf9c4..facc665f6 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -3,56 +3,99 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - smuggle_url, - update_url_query, + int_or_none, + parse_age_limit, + parse_duration, + try_get, + unified_timestamp, ) class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.fox.com/watch/255180355939/7684182528', + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)' + _TESTS = [{ + # clip + 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', 'md5': 'ebd296fcc41dd4b19f8115d8461a3165', 'info_dict': { - 'id': '255180355939', + 'id': '4b765a60490325103ea69888fb2bd4e8', 'ext': 'mp4', - 'title': 'Official Trailer: Gotham', - 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', - 'duration': 129, - 'timestamp': 1400020798, - 'upload_date': '20140513', - 'uploader': 'NEWA-FNG-FOXCOM', + 'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'description': 'md5:549cd9c70d413adb32ce2a779b53b486', + 'duration': 102, + 'timestamp': 1504291893, + 'upload_date': '20170901', + 'creator': 'FOX', + 'series': 'Gotham', }, - 'add_ie': ['ThePlatform'], - } + 'params': { + 'skip_download': True, + }, + }, { + # episode, geo-restricted + 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', + 'only_matching': True, + }, { + # episode, geo-restricted, tv provided required + 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), video_id) - fox_pdk_player = settings['fox_pdk_player'] - release_url = fox_pdk_player['release_url'] - query = { - 'mbr': 'true', - 'switch': 'http' - } - if fox_pdk_player.get('access') == 'locked': - ap_p = settings['foxAdobePassProvider'] - rating = ap_p.get('videoRating') - if rating == 'n/a': - rating = None - resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) + video = self._download_json( + 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, + video_id, headers={ + 'apikey': 'abdcbed02c124d393b39e818a4312055', + 'Content-Type': 'application/json', + 'Referer': url, + }) - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + title = video['name'] + + m3u8_url = self._download_json( + video['videoRelease']['url'], video_id)['playURL'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = video.get('description') + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = unified_timestamp(video.get('datePublished')) + age_limit = parse_age_limit(video.get('contentRating')) + + data = try_get( + video, lambda x: x['trackingData']['properties'], dict) or {} + + creator = data.get('brand') or data.get('network') or video.get('network') + + series = video.get('seriesName') or data.get( + 'seriesName') or data.get('show') + season_number = int_or_none(video.get('seasonNumber')) + episode = video.get('name') + episode_number = int_or_none(video.get('episodeNumber')) + release_year = int_or_none(video.get('releaseYear')) + + if data.get('authRequired'): + # TODO: AP + pass + + return { 'id': video_id, - }) - - return info + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'creator': creator, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'release_year': release_year, + 'formats': formats, + } From b98339b54b1de517def970a955cbbdda3e1d4874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 22:15:55 +0700 Subject: [PATCH 036/337] [ChangeLog] Actualize --- ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ChangeLog b/ChangeLog index c439c8ef9..86b36c37e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version <unreleased> + +Core ++ [utils] Introduce bool_or_none +* [YoutubeDL] Ensure dir existence for each requested format (#14116) + +Extractors +* [fox] Fix extraction (#14147) +* [rutube] Use bool_or_none +* [rutube] Rework and generalize playlist extractors (#13565) ++ [rutube:playlist] Add support for playlists (#13534, #13565) ++ [radiocanada] Add fallback for title extraction (#14145) +* [vk] Use dedicated YouTube embeds extraction routine +* [vice] Use dedicated YouTube embeds extraction routine +* [cracked] Use dedicated YouTube embeds extraction routine +* [chilloutzone] Use dedicated YouTube embeds extraction routine +* [abcnews] Use dedicated YouTube embeds extraction routine +* [youtube] Separate methods for embeds extraction +* [redtube] Fix formats extraction (#14122) +* [arte] Relax unavailability check (#14112) ++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) +* [vidme:user] Relax URL regular expression (#14054) +* [bpb] Fix extraction (#14043, #14086) +* [soundcloud] Fix download URL with private tracks (#14093) +* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) +* [viidea] Capture and output lecture error message (#14099) +* [radiocanada] Skip unsupported platforms (#14100) + + version 2017.09.02 Extractors From 806498cf2f35cc98cf0e6c5b46f58ca357a842de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 10 Sep 2017 22:16:55 +0700 Subject: [PATCH 037/337] release 2017.09.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 4 +++- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index bd9e21983..fb934d4ba 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.02** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.02 +[debug] youtube-dl version 2017.09.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8091e7b5..333acee80 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -82,6 +82,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -149,7 +151,7 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. 8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. 9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: diff --git a/ChangeLog b/ChangeLog index 86b36c37e..99667e5e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.10 Core + [utils] Introduce bool_or_none diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dbec6c8dc..798a81d3c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -38,6 +38,7 @@ - **afreecatv**: afreecatv.com - **afreecatv:global**: afreecatv.com - **AirMozilla** + - **AliExpressLive** - **AlJazeera** - **Allocine** - **AlphaPorno** @@ -437,6 +438,7 @@ - **MakerTV** - **mangomolo:live** - **mangomolo:video** + - **ManyVids** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** @@ -701,6 +703,7 @@ - **rutube:embed**: Rutube embedded videos - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos + - **rutube:playlist**: Rutube playlists - **RUTV**: RUTV.RU - **Ruutu** - **Ruv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 60ed35de9..736f753df 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.02' +__version__ = '2017.09.10' From f12a6e88b2c2632b10c156eb94d91675327485f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 03:22:27 +0700 Subject: [PATCH 038/337] [rutube:playlist] Fix suitable (closes #14166) --- youtube_dl/extractor/rutube.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 828c03b48..89d89b65a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -265,8 +265,10 @@ class RutubePlaylistIE(RutubePlaylistBaseIE): _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json' - @staticmethod - def suitable(url): + @classmethod + def suitable(cls, url): + if not super(RutubePlaylistIE, cls).suitable(url): + return False params = compat_parse_qs(compat_urllib_parse_urlparse(url).query) return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0]) From 43df248f10548e3c43f0f02584a360136f1129d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 03:27:43 +0700 Subject: [PATCH 039/337] [ChangeLog] Actualize --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index 99667e5e2..189276408 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [rutube:playlist] Fix suitable (#14166) + + version 2017.09.10 Core From 7dacceae75d3c513f442cfd20d778a31bb35d3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 03:30:33 +0700 Subject: [PATCH 040/337] release 2017.09.11 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index fb934d4ba..f40cb2c4e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.10*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.10 +[debug] youtube-dl version 2017.09.11 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 189276408..c286da6c6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.11 Extractors * [rutube:playlist] Fix suitable (#14166) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 736f753df..cdcb32e06 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.10' +__version__ = '2017.09.11' From 2709d9fa28155f7abc84d3b57ce4491391d185ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 04:14:54 +0700 Subject: [PATCH 041/337] [animeondemand] Add support for flash videos (closes #9944) --- youtube_dl/extractor/animeondemand.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9e28f2579..c22530778 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -46,6 +46,10 @@ class AnimeOnDemandIE(InfoExtractor): # Full length film, non-series, ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/185', 'only_matching': True, + }, { + # Flash videos + 'url': 'https://www.anime-on-demand.de/anime/12', + 'only_matching': True, }] def _login(self): @@ -120,10 +124,11 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): + r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): attributes = extract_attributes(input_) + title = attributes.get('data-dialog-header') playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist'): + for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): playlist_url = attributes.get(playlist_key) if isinstance(playlist_url, compat_str) and re.match( r'/?[\da-zA-Z]+', playlist_url): @@ -160,6 +165,23 @@ class AnimeOnDemandIE(InfoExtractor): fatal=False) if not playlist: continue + stream_url = playlist.get('streamurl') + if stream_url: + rtmp = re.search( + r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', + stream_url) + if rtmp: + formats.append({ + 'url': rtmp.group('url'), + 'app': rtmp.group('app'), + 'play_path': rtmp.group('playpath'), + 'page_url': url, + 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', + 'rtmp_real_time': True, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + continue start_video = playlist.get('startvideo', 0) playlist = playlist.get('playlist') if not playlist or not isinstance(playlist, list): From 018cc61549f417cf1e88af46ff68a17b75e62630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 04:22:55 +0700 Subject: [PATCH 042/337] [animeondemand] Bypass geo restriction --- youtube_dl/extractor/animeondemand.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index c22530778..2a1cd6520 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -21,6 +21,8 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' + # German-speaking countries of Europe + _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] _TESTS = [{ # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', From 2f483758bc6a6661f1215c38161ee626d90ab655 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 11 Sep 2017 04:32:35 +0700 Subject: [PATCH 043/337] [animeondemand] Improve and modernize --- youtube_dl/extractor/animeondemand.py | 34 +++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 2a1cd6520..69d363311 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,16 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, ExtractorError, - sanitized_Request, urlencode_postdata, + urljoin, ) @@ -78,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor): 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', self._LOGIN_URL) + post_url = urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), headers={ + 'Referer': self._LOGIN_URL, + }) if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( @@ -154,17 +150,19 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), + item_id_list = [] + if format_id: + item_id_list.append(format_id) + item_id_list.append('videomaterial') + playlist = self._download_json( + urljoin(url, playlist_url), video_id, + 'Downloading %s JSON' % ' '.join(item_id_list), headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-CSRF-Token': csrf_token, 'Referer': url, 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) - playlist = self._download_json( - request, video_id, 'Downloading %s playlist JSON' % format_id, - fatal=False) + }, fatal=False) if not playlist: continue stream_url = playlist.get('streamurl') @@ -246,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor): f.update({ 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), + 'url': urljoin(url, m.group('href')), }) entries.append(f) From e7c3e33456e155106ad08347d8ab9a2ecd0c8a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Sep 2017 23:19:53 +0700 Subject: [PATCH 044/337] [downloader/fragment] Restart inconsistent incomplete fragment downloads (#13731) --- youtube_dl/downloader/fragment.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bccc8ecc1..6f6fb4a77 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -151,10 +151,15 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) + if ctx['fragment_index'] > 0 and resume_len == 0: + self.report_error( + 'Inconsistent state of incomplete fragment download. ' + 'Restarting from the beginning...') + ctx['fragment_index'] = resume_len = 0 + self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) - if ctx['fragment_index'] > 0: - assert resume_len > 0 + assert ctx['fragment_index'] == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) From 319fc70676fea19b71437aab4078d4d72cc1ba5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Sep 2017 23:50:19 +0700 Subject: [PATCH 045/337] [tv4] Relax _VALID_URL (closes #14206) --- youtube_dl/extractor/tv4.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 7aeb2c620..6487039e5 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -18,7 +18,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^\?]+)\?video_id=| + (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -63,6 +63,10 @@ class TV4IE(InfoExtractor): 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, + { + 'url': ' http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + } ] def _real_extract(self, url): From 0732a90579091ad60b124fd693bdda8ee526e305 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 14 Sep 2017 20:37:46 +0200 Subject: [PATCH 046/337] [orf] Add new extractor for f4m stories --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/orf.py | 116 +++++++++++++++++++++++++++-- 2 files changed, 112 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aefadc56f..a3a97e940 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -768,6 +768,7 @@ from .ora import OraTVIE from .orf import ( ORFTVthekIE, ORFFM4IE, + ORFFM4StoryIE, ORFOE1IE, ORFIPTVIE, ) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index cc296eabd..74fe8017e 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,14 +6,15 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - HEADRequest, - unified_strdate, - strip_jsonp, - int_or_none, - float_or_none, determine_ext, + float_or_none, + HEADRequest, + int_or_none, + orderedSet, remove_end, + strip_jsonp, unescapeHTML, + unified_strdate, ) @@ -307,3 +308,108 @@ class ORFIPTVIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, } + + +class ORFFM4StoryIE(InfoExtractor): + IE_NAME = 'orf:fm4:story' + IE_DESC = 'fm4.orf.at stories' + _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)' + + _TEST = { + 'url': 'http://fm4.orf.at/stories/2865738/', + 'playlist': [{ + 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', + 'info_dict': { + 'id': '547792', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + 'duration': 1748.52, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + }, { + 'md5': 'c6dd2179731f86f4f55a7b49899d515f', + 'info_dict': { + 'id': '547798', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live (2)', + 'duration': 1504.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + }, + }], + } + + def _real_extract(self, url): + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + + entries = [] + all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) + for idx, video_id in enumerate(all_ids): + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['q8c'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') + if idx >= 1: + # Titles are duplicates, make them unique + title += ' (' + str(idx + 1) + ')' + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + entries.append({ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + + return self.playlist_result(entries) From fad9fc537d8ab5141f402f6ccdf60161a8d0302a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 14 Sep 2017 20:47:23 +0200 Subject: [PATCH 047/337] [tv4] fix a test URL --- youtube_dl/extractor/tv4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 6487039e5..cfcce020a 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -64,7 +64,7 @@ class TV4IE(InfoExtractor): 'only_matching': True, }, { - 'url': ' http://www.tv4play.se/program/farang/3922081', + 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, } ] From c46680fb2a0ee61faa25863d3c10b7b098cdbe67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 01:59:47 +0700 Subject: [PATCH 048/337] [condenast] Fix extraction (closes #14196, closes #14207) --- youtube_dl/extractor/condenast.py | 54 ++++++++++++++++++------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 0c3f0c0e4..ed278fefc 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -116,16 +116,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video_params(self, webpage): - query = {} - params = self._search_regex( - r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) - if params: - query.update({ - 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), - 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), - 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), - }) + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) else: params = extract_attributes(self._search_regex( r'(<[^>]+data-js="video-player"[^>]+>)', @@ -141,17 +141,27 @@ class CondeNastIE(InfoExtractor): video_id = params['videoId'] video_info = None - if params.get('playerId'): - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=params) - if info_page: - video_info = info_page.get('video') - if not video_info: - info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=params) - else: + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: info_page = self._download_webpage( 'https://player.cnevids.com/inline/video/%s.js' % video_id, video_id, 'Downloading inline info', query={ @@ -215,7 +225,7 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage) + params = self._extract_video_params(webpage, display_id) info = self._search_json_ld( webpage, display_id, fatal=False) info.update(self._extract_video(params)) From 86e55e317cb70f07792cfe543186ad520cbe3230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 21:45:18 +0700 Subject: [PATCH 049/337] [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index c286da6c6..38d511362 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version <unreleased> + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression (#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + version 2017.09.11 Extractors From 159d304a9fa2b91d91a60fe3bdf2211a59bcf346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 21:48:06 +0700 Subject: [PATCH 050/337] release 2017.09.15 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f40cb2c4e..98ab5b6ca 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.15*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.15** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.11 +[debug] youtube-dl version 2017.09.15 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 38d511362..041dfd7b9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.09.15 Core * [downloader/fragment] Restart inconsistent incomplete fragment downloads diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 798a81d3c..6b01dc910 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -593,6 +593,7 @@ - **Openload** - **OraTV** - **orf:fm4**: radio FM4 + - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cdcb32e06..8399c04fe 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.11' +__version__ = '2017.09.15' From cbf85239bbb835162725cd4c8758831ca1003445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 22:13:30 +0700 Subject: [PATCH 051/337] [vgtv] Relax _VALID_URL (closes #14223) --- youtube_dl/extractor/vgtv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 0f8c156a7..c21a09c01 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -42,7 +42,7 @@ class VGTVIE(XstreamIE): ) /? (?: - \#!/(?:video|live)/| + (?:\#!/)?(?:video|live)/| embed?.*id=| articles/ )| @@ -146,7 +146,11 @@ class VGTVIE(XstreamIE): { 'url': 'abtv:140026', 'only_matching': True, - } + }, + { + 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', + 'only_matching': True, + }, ] def _real_extract(self, url): From b763e1d68c6becc414a802a452f5aa819c5de920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 22:18:38 +0700 Subject: [PATCH 052/337] [twitch] Add support for go.twitch.tv URLs (closes #14215) --- youtube_dl/extractor/twitch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2daf9dfac..c926c99a9 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import ( class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' @@ -217,7 +217,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P<id>\d+) @@ -458,7 +458,7 @@ class TwitchStreamIE(TwitchBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/| + (?:(?:www|go)\.)?twitch\.tv/| player\.twitch\.tv/\?.*?\bchannel= ) (?P<id>[^/#?]+) @@ -489,6 +489,9 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?channel=lotsofs', 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/food', + 'only_matching': True, }] @classmethod From 6be44a50edfe2e75e31553e7a128ce1849301958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 22:25:38 +0700 Subject: [PATCH 053/337] [dailymotion:playlist] Relax _VALID_URL (closes #14219) --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 74e991331..e9d0dd19c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -325,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' _TESTS = [{ From a4245acef85ac2414e77cf2cda4cb39adb617241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 15 Sep 2017 23:12:19 +0700 Subject: [PATCH 054/337] [noovo] Fix extraction (closes #14214) --- youtube_dl/extractor/noovo.py | 61 ++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index f7fa098a5..974de3c3e 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + js_to_json, smuggle_url, try_get, ) @@ -24,8 +25,6 @@ class NoovoIE(InfoExtractor): 'timestamp': 1491399228, 'upload_date': '20170405', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': 'RPM+', }, 'params': { @@ -37,13 +36,11 @@ class NoovoIE(InfoExtractor): 'info_dict': { 'id': '5395865725001', 'title': 'Épisode 13 : Les retrouvailles', - 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'description': 'md5:888c3330f0c1b4476c5bc99a1c040473', 'ext': 'mp4', 'timestamp': 1492019320, 'upload_date': '20170412', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': "L'amour est dans le pré", 'season_number': 5, 'episode': 'Épisode 13', @@ -58,40 +55,46 @@ class NoovoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, - video_id)['data'] + webpage = self._download_webpage(url, video_id) - content = try_get(data, lambda x: x['contents'][0]) + bc_url = BrightcoveNewIE._extract_url(self, webpage) - brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + data = self._parse_json( + self._search_regex( + r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + + title = try_get( + data, lambda x: x['video']['nom'], + compat_str) or self._html_search_meta( + 'dcterms.Title', webpage, 'title', fatal=True) + + description = self._html_search_meta( + ('dcterms.Description', 'description'), webpage, 'description') series = try_get( - data, ( - lambda x: x['show']['title'], - lambda x: x['season']['show']['title']), - compat_str) + data, lambda x: x['emission']['nom']) or self._search_regex( + r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)', + webpage, 'series', default=None) - episode = None - og = data.get('og') - if isinstance(og, dict) and og.get('type') == 'video.episode': - episode = og.get('title') + season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} + season = try_get(season_el, lambda x: x['nom'], compat_str) + season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) - video = content or data + episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} + episode = try_get(episode_el, lambda x: x['nom'], compat_str) + episode_number = int_or_none(try_get(episode_el, lambda x: x['numero'])) return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['CA']}), - 'id': brightcove_id, - 'title': video.get('title'), - 'creator': video.get('source'), - 'view_count': int_or_none(video.get('viewsCount')), + 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'title': title, + 'description': description, 'series': series, - 'season_number': int_or_none(try_get( - data, lambda x: x['season']['seasonNumber'])), + 'season': season, + 'season_number': season_number, 'episode': episode, - 'episode_number': int_or_none(data.get('episodeNumber')), + 'episode_number': episode_number, } From 68d43a61b552007a718894967b869c0f1d8ff00f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 16 Sep 2017 12:14:48 +0800 Subject: [PATCH 055/337] Ignore TTML subtitles --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a5b585f43..fbf7cecb2 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ cover/ updates_key.pem *.egg-info *.srt +*.ttml *.sbv *.vtt *.flv From 3869028ffb6be6ab719e5cf1004276dfdfd1216d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 16 Sep 2017 12:18:38 +0800 Subject: [PATCH 056/337] [utils] Use bytes-like objects in dfxp2srt This fixes handling of non-UTF8 TTML subtitles Closes #14191 --- ChangeLog | 6 ++++++ test/test_utils.py | 26 +++++++++++++++++++++++--- youtube_dl/postprocessor/ffmpeg.py | 2 +- youtube_dl/utils.py | 18 +++++++++++------- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 041dfd7b9..ba9260e3e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Core +* [utils] Fix handling raw TTML subtitles (#14191) + + version 2017.09.15 Core diff --git a/test/test_utils.py b/test/test_utils.py index e50f3764e..efa73d0f4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1064,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') <p begin="3" dur="-1">Ignored, three</p> </div> </body> - </tt>''' + </tt>'''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The following line contains Chinese characters and special symbols @@ -1089,7 +1089,7 @@ Line <p begin="0" end="1">The first line</p> </div> </body> - </tt>''' + </tt>'''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The first line @@ -1115,7 +1115,7 @@ The first line <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p> </div> </body> -</tt>''' +</tt>'''.encode('utf-8') srt_data = '''1 00:00:02,080 --> 00:00:05,839 <font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font> @@ -1138,6 +1138,26 @@ part 3</font></u> ''' self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) + dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?> + <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter"> + <body> + <div xml:lang="en"> + <p begin="0" end="1">Line 1</p> + <p begin="1" end="2">第二行</p> + </div> + </body> + </tt>'''.encode('utf-16') + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +Line 1 + +2 +00:00:01,000 --> 00:00:02,000 +第二行 + +''' + self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data) + def test_cli_option(self): self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 51256a3fb..f71d413b5 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): dfxp_file = old_file srt_file = subtitles_filename(filename, lang, 'srt') - with io.open(dfxp_file, 'rt', encoding='utf-8') as f: + with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) with io.open(srt_file, 'wt', encoding='utf-8') as f: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9e4492d40..b724e0b70 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' LEGACY_NAMESPACES = ( - ('http://www.w3.org/ns/ttml', [ - 'http://www.w3.org/2004/11/ttaf1', - 'http://www.w3.org/2006/04/ttaf1', - 'http://www.w3.org/2006/10/ttaf1', + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', ]), - ('http://www.w3.org/ns/ttml#styling', [ - 'http://www.w3.org/ns/ttml#style', + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', ]), ) @@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data): for ns in v: dfxp_data = dfxp_data.replace(ns, k) - dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') From 790d379e4df2f85ece7cab02e805643234bb5c16 Mon Sep 17 00:00:00 2001 From: Windom <windom@users.noreply.github.com> Date: Sat, 16 Sep 2017 18:39:46 +0300 Subject: [PATCH 057/337] [morningstar] Relax _VALID_URL --- youtube_dl/extractor/morningstar.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py index 320d27bdd..0093bcd6c 100644 --- a/youtube_dl/extractor/morningstar.py +++ b/youtube_dl/extractor/morningstar.py @@ -8,8 +8,8 @@ from .common import InfoExtractor class MorningstarIE(InfoExtractor): IE_DESC = 'morningstar.com' - _VALID_URL = r'https?://(?:www\.)?morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', 'info_dict': { @@ -19,7 +19,10 @@ class MorningstarIE(InfoExtractor): 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' } - } + }, { + 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 8251af63a12cd73cf2578c81dbb869232da2592c Mon Sep 17 00:00:00 2001 From: Vijay Singh <sudovijay@users.noreply.github.com> Date: Sat, 16 Sep 2017 21:15:23 +0530 Subject: [PATCH 058/337] [viki] Update app data (closes #14181) --- youtube_dl/extractor/viki.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index e9c8bf824..853e5c75f 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -23,9 +23,9 @@ class VikiBaseIE(InfoExtractor): _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' - _APP = '65535a' + _APP = '100005a' _APP_VERSION = '2.2.5.1428709186' - _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False _NETRC_MACHINE = 'viki' From 4ed2d7b7d1f67e499a46e507d957616e364565ca Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 17 Sep 2017 13:53:04 +0800 Subject: [PATCH 059/337] Fix flake8 issues after #14225 --- youtube_dl/extractor/common.py | 2 +- youtube_dl/utils.py | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 317a9a76f..2bbbf8f4d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2452,7 +2452,7 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, port, not port is None, domain, True, + 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b724e0b70..acc4f987b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3830,23 +3830,23 @@ def cookie_to_dict(cookie): cookie_dict = { 'name': cookie.name, 'value': cookie.value, - }; + } if cookie.port_specified: cookie_dict['port'] = cookie.port if cookie.domain_specified: cookie_dict['domain'] = cookie.domain if cookie.path_specified: cookie_dict['path'] = cookie.path - if not cookie.expires is None: + if cookie.expires is not None: cookie_dict['expires'] = cookie.expires - if not cookie.secure is None: + if cookie.secure is not None: cookie_dict['secure'] = cookie.secure - if not cookie.discard is None: + if cookie.discard is not None: cookie_dict['discard'] = cookie.discard try: if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): cookie_dict['httponly'] = True except TypeError: pass @@ -3957,7 +3957,7 @@ class PhantomJSwrapper(object): cookies = json.loads(f.read().decode('utf-8')) for cookie in cookies: if cookie['httponly'] is True: - cookie['rest'] = { 'httpOnly': None } + cookie['rest'] = {'httpOnly': None} if 'expiry' in cookie: cookie['expire_time'] = cookie['expiry'] self.extractor._set_cookie(**cookie) @@ -3965,7 +3965,7 @@ class PhantomJSwrapper(object): def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ Downloads webpage (if needed) and executes JS - + Params: url: website url html: optional, html code of website @@ -3974,11 +3974,11 @@ class PhantomJSwrapper(object): note2: optional, displayed when executing JS headers: custom http headers jscode: code to be executed when page is loaded - + Returns tuple with: * downloaded website (after JS execution) * anything you print with `console.log` (but not inside `page.execute`!) - + In most cases you don't need to add any `jscode`. It is executed in `page.onLoadFinished`. `saveAndExit();` is mandatory, use it instead of `phantom.exit()` @@ -3992,7 +3992,7 @@ class PhantomJSwrapper(object): else window.setTimeout(check, 500); } - + page.evaluate(function(){ document.querySelector('#a').click(); }); @@ -4024,13 +4024,14 @@ class PhantomJSwrapper(object): else: self.extractor.to_screen('%s: %s' % (video_id, note2)) - p = subprocess.Popen([self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen([ + self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() if p.returncode != 0: - raise ExtractorError('Executing JS failed\n:' - + encodeArgument(err)) + raise ExtractorError( + 'Executing JS failed\n:' + encodeArgument(err)) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') From 9c2a17f2ce7b2b9dc45b603be413a943f6637498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 17 Sep 2017 22:19:57 +0700 Subject: [PATCH 060/337] [popcorntv] Add extractor (closes #5914, closes #14211) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/popcorntv.py | 78 ++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/popcorntv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a3a97e940..ab95c8575 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -808,6 +808,7 @@ from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, ) +from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE from .pornflip import PornFlipIE diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py new file mode 100644 index 000000000..ac901f426 --- /dev/null +++ b/youtube_dl/extractor/popcorntv.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P<display_id>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(<link[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r'<h1[^>]+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)<article[^>]+itemprop=["\']description[^>]*>(.+?)</article>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + print(self._html_search_meta( + 'duration', webpage)) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } From 4d8c4b46d5668387cc685123f80fb64bbc7c5aff Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Sun, 17 Sep 2017 15:46:52 +0000 Subject: [PATCH 061/337] [heise] Add support for YouTube embeds --- youtube_dl/extractor/heise.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 382f32771..495ffb7dc 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( determine_ext, int_or_none, @@ -25,6 +26,22 @@ class HeiseIE(InfoExtractor): 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', 'thumbnail': r're:^https?://.*/gallery/$', } + }, { + # YouTube embed + 'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', + 'md5': 'e403d2b43fea8e405e88e3f8623909f1', + 'info_dict': { + 'id': '6kmWbXleKW4', + 'ext': 'mp4', + 'title': 'NEU IM SEPTEMBER | Netflix', + 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'upload_date': '20170830', + 'uploader': 'Netflix Deutschland, Österreich und Schweiz', + 'uploader_id': 'netflixdach', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -40,6 +57,16 @@ class HeiseIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('fulltitle', webpage, default=None) + if not title or title == "c't": + title = self._search_regex( + r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') + + yt_urls = YoutubeIE._extract_urls(webpage) + if yt_urls: + return self.playlist_from_matches(yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + container_id = self._search_regex( r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', webpage, 'container ID') @@ -47,12 +74,6 @@ class HeiseIE(InfoExtractor): r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', webpage, 'sequenz ID') - title = self._html_search_meta('fulltitle', webpage, default=None) - if not title or title == "c't": - title = self._search_regex( - r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') - doc = self._download_xml( 'http://www.heise.de/videout/feed', video_id, query={ 'container': container_id, From 8a1a60d17397721620e75d83f2aad3a353286f15 Mon Sep 17 00:00:00 2001 From: Kareem Moussa <itstehkman@gmail.com> Date: Tue, 19 Sep 2017 08:51:20 -0700 Subject: [PATCH 062/337] [devscripts/check-porn] Fix gettestcases import --- devscripts/check-porn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 7a219ebe9..72b2ee422 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -14,7 +14,7 @@ import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_testcases +from test.helper import gettestcases from youtube_dl.utils import compat_urllib_parse_urlparse from youtube_dl.utils import compat_urllib_request @@ -24,7 +24,7 @@ if len(sys.argv) > 1: else: METHOD = 'EURISTIC' -for test in get_testcases(): +for test in gettestcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() From dc76eef092dc10f7e3f599fa7d85a04de8d84b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 19 Sep 2017 23:59:36 +0700 Subject: [PATCH 063/337] [tvplay] Bypass geo restriction --- youtube_dl/extractor/tvplay.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 99ff82a5d..46132eda1 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -15,7 +15,9 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + smuggle_url, try_get, + unsmuggle_url, update_url_query, ) @@ -224,6 +226,9 @@ class TVPlayIE(InfoExtractor): ] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, @@ -426,4 +431,9 @@ class ViafreeIE(InfoExtractor): r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', webpage, 'video id') - return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) + return self.url_result( + smuggle_url( + 'mtg:%s' % video_id, + {'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]]}), + ie=TVPlayIE.ie_key(), video_id=video_id) From 3b65a6fbf31256030ff35210a7be2b50369a6c4f Mon Sep 17 00:00:00 2001 From: capital-G <dennis.scheiba@gmx.de> Date: Tue, 19 Sep 2017 22:58:06 +0200 Subject: [PATCH 064/337] [twitter] Fix duration extraction --- youtube_dl/extractor/twitter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6eaf360a6..7399cf538 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -229,7 +229,7 @@ class TwitterCardIE(TwitterBaseIE): title = self._search_regex(r'<title>([^<]+)', webpage, 'title') thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration')) or duration + duration = float_or_none(config.get('duration'), scale=1000) or duration return { 'id': video_id, @@ -255,6 +255,7 @@ class TwitterIE(InfoExtractor): 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', + 'duration': 12.922, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -305,11 +306,12 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'あかさ - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'あかさ on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'Donte', + 'uploader': 'あかさ', 'uploader_id': 'jaydingeer', + 'duration': 30.0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -337,6 +339,7 @@ class TwitterIE(InfoExtractor): 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', 'uploader_id': 'captainamerica', 'uploader': 'Captain America', + 'duration': 3.17, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -364,6 +367,7 @@ class TwitterIE(InfoExtractor): 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', + 'duration': 277.4, }, 'params': { 'format': 'best[format_id^=http-]', From 12ea5c79fb0bfa878d62d130cf67057fc230dfa7 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Wed, 20 Sep 2017 14:53:06 -0500 Subject: [PATCH 065/337] [nbcsports:vplayer] Correct theplatform URL (closes #13873) --- youtube_dl/extractor/nbc.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 62db70b43..836a41f06 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -109,10 +109,10 @@ class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', 'info_dict': { 'id': '9CsDKds0kvHI', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', 'timestamp': 1426270238, @@ -120,7 +120,7 @@ class NBCSportsVPlayerIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', } }, { - 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, }] @@ -134,7 +134,8 @@ class NBCSportsVPlayerIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage) + theplatform_url = self._og_search_video_url(webpage).replace( + 'vplayer.nbcsports.com', 'player.theplatform.com') return self.url_result(theplatform_url, 'ThePlatform') From f6ff52b473c9ed969fadb3e3d50852c4a27ba17e Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano Date: Wed, 20 Sep 2017 23:05:33 +0200 Subject: [PATCH 066/337] [beeg] Fix extraction (closes #14275) --- youtube_dl/extractor/beeg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index d5c5822f2..bbeae4bac 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -9,6 +9,7 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + urljoin, ) @@ -36,9 +37,11 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) cpl_url = self._search_regex( - r']+src=(["\'])(?P(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', + r']+src=(["\'])(?P(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', webpage, 'cpl', default=None, group='url') + cpl_url = urljoin(url, cpl_url) + beeg_version, beeg_salt = [None] * 2 if cpl_url: @@ -54,7 +57,7 @@ class BeegIE(InfoExtractor): r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg salt', default=None, group='beeg_salt') - beeg_version = beeg_version or '2000' + beeg_version = beeg_version or '2185' beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' video = self._download_json( From 8c6919e4331e1cd44f50e700e8fc4e630d913a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Sep 2017 23:00:35 +0700 Subject: [PATCH 067/337] [lynda] Add support for educourse.ga (closes #14286) --- youtube_dl/extractor/lynda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index d2f75296a..1b6f5091d 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,7 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?lynda\.com/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -110,6 +110,9 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', 'only_matching': True, + }, { + 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -253,7 +256,7 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 8c2895305dc09920055611c8120f5a65fcd2614f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Sep 2017 02:30:03 +0800 Subject: [PATCH 068/337] [options] Accept lrc as a subtitle conversion target format (closes #14292) --- ChangeLog | 1 + youtube_dl/__init__.py | 2 +- youtube_dl/options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ba9260e3e..42ba879b1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Core ++ [options] Accept lrc as a subtitle conversion target format (#14292) * [utils] Fix handling raw TTML subtitles (#14191) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c4589411e..ba684a075 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -206,7 +206,7 @@ def _real_main(argv=None): if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: parser.error('invalid subtitle format specified') if opts.date is not None: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 38439c971..4c0455044 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -847,7 +847,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--convert-subs', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt)') + help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') parser.add_option_group(general) parser.add_option_group(network) From 2384f5a64e501d7abb844e8d31fe340b34d8d4e7 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 6 Sep 2017 11:24:34 +0900 Subject: [PATCH 069/337] [mixcloud] Fix extraction (closes #14088) --- youtube_dl/compat.py | 10 +- youtube_dl/extractor/mixcloud.py | 157 ++++++++++++++++++++----------- 2 files changed, 109 insertions(+), 58 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e4e13bcf..2a62248ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -6,6 +6,7 @@ import collections import email import getpass import io +import itertools import optparse import os import re @@ -15,7 +16,6 @@ import socket import struct import subprocess import sys -import itertools import xml.etree.ElementTree @@ -2898,6 +2898,13 @@ else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip __all__ = [ 'compat_HTMLParseError', @@ -2948,5 +2955,6 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', + 'compat_zip', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index f6360cce6..481182380 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,16 +9,16 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, - compat_str, compat_urllib_parse_unquote, compat_urlparse, + compat_zip ) from ..utils import ( clean_html, ExtractorError, OnDemandPagedList, str_to_int, -) + try_get) class MixcloudIE(InfoExtractor): @@ -54,27 +54,19 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] - _keys = [ - 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };', - 'pleasedontdownloadourmusictheartistswontgetpaid', - 'window.addEventListener = window.addEventListener || function() {};', - '(function() { return new Date().toLocaleDateString(); })()' - ] - _current_key = None + @staticmethod + def _decrypt_xor_cipher(key, ciphertext): + """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(k)) + for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) - # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - def _decrypt_play_info(self, play_info, video_id): - play_info = base64.b64decode(play_info.encode('ascii')) - for num, key in enumerate(self._keys, start=1): - try: - return self._parse_json( - ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) - for idx, ch in enumerate(play_info)]), - video_id) - except ExtractorError: - if num == len(self._keys): - raise + @staticmethod + def _decrypt_and_extend(stream_info, url_key, getter, key, formats): + maybe_url = stream_info.get(url_key) + if maybe_url is not None: + decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) + formats.extend(getter(decrypted)) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -84,54 +76,105 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) - if not self._current_key: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url: - js = self._download_webpage(js_url, track_id, fatal=False) - if js: - KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): - key = self._search_regex( - KEY_RE_TEMPLATE % key_name, js, 'key', - default=None, group='key') - if key and isinstance(key, compat_str): - self._keys.insert(0, key) - self._current_key = key + # Legacy path + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + + if encrypted_play_info is not None: + # Decode + encrypted_play_info = base64.b64decode(encrypted_play_info) + else: + # New path + full_info_json = self._parse_json(self._html_search_regex( + r'', webpage, 'play info'), 'play info') + for item in full_info_json: + item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + if try_get(item_data, lambda x: x['streamInfo']['url']): + info_json = item_data + break + else: + raise ExtractorError('Failed to extract matching stream info') message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info') + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', + webpage, 'js url', default=None) + if js_url is None: + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id) + # Known plaintext attack + if encrypted_play_info: + kps = ['{"stream_url":'] + kpa_target = encrypted_play_info + else: + kps = ['https://', 'http://'] + kpa_target = base64.b64decode(info_json['streamInfo']['url']) + for kp in kps: + partial_key = self._decrypt_xor_cipher(kpa_target, kp) + for quote in ["'", '"']: + key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, + "encryption key", default=None) + if key is not None: + break + else: + continue + break + else: + raise ExtractorError('Failed to extract encryption key') - play_info = self._decrypt_play_info(encrypted_play_info, track_id) + if encrypted_play_info is not None: + play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') + if message and 'stream_url' not in play_info: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + song_url = play_info['stream_url'] + formats = [{ + 'format_id': 'normal', + 'url': song_url + }] - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') + thumbnail = self._proto_relative_url(self._html_search_regex( + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) + uploader = self._html_search_regex( + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) + uploader_id = self._search_regex( + r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) + description = self._og_search_description(webpage) + view_count = str_to_int(self._search_regex( + [r'([0-9,.]+)', + r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], + webpage, 'play count', default=None)) - song_url = play_info['stream_url'] + else: + title = info_json['name'] + thumbnail = try_get(info_json, + lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + uploader = try_get(info_json, lambda x: x['owner']['displayName']) + uploader_id = try_get(info_json, lambda x: x['owner']['username']) + description = try_get(info_json, lambda x: x['description']) + view_count = try_get(info_json, lambda x: x['plays']) - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'([0-9,.]+)', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + stream_info = info_json['streamInfo'] + formats = [] + self._decrypt_and_extend(stream_info, 'url', lambda x: [{ + 'format_id': 'normal', + 'url': x + }], key, formats) + self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, + formats) + self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, + formats) return { 'id': track_id, 'title': title, - 'url': song_url, + 'formats': formats, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, From 095774e59130c999ed8ce132f80a7164c5ee39a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 05:35:55 +0700 Subject: [PATCH 070/337] [mixcloud] Improve and simplify (closes #14132) --- youtube_dl/extractor/mixcloud.py | 71 ++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 481182380..f331db890 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, + compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_zip @@ -16,9 +17,12 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + int_or_none, OnDemandPagedList, str_to_int, - try_get) + try_get, + urljoin, +) class MixcloudIE(InfoExtractor): @@ -61,13 +65,6 @@ class MixcloudIE(InfoExtractor): compat_chr(compat_ord(ch) ^ compat_ord(k)) for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) - @staticmethod - def _decrypt_and_extend(stream_info, url_key, getter, key, formats): - maybe_url = stream_info.get(url_key) - if maybe_url is not None: - decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) - formats.extend(getter(decrypted)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) @@ -86,9 +83,12 @@ class MixcloudIE(InfoExtractor): else: # New path full_info_json = self._parse_json(self._html_search_regex( - r'', webpage, 'play info'), 'play info') + r'', + webpage, 'play info'), 'play info') for item in full_info_json: - item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + item_data = try_get( + item, lambda x: x['cloudcast']['data']['cloudcastLookup'], + dict) if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break @@ -100,13 +100,9 @@ class MixcloudIE(InfoExtractor): webpage, 'error message', default=None) js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url is None: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', - webpage, 'js url') - js = self._download_webpage(js_url, track_id) + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id, 'Downloading JS') # Known plaintext attack if encrypted_play_info: kps = ['{"stream_url":'] @@ -117,8 +113,9 @@ class MixcloudIE(InfoExtractor): for kp in kps: partial_key = self._decrypt_xor_cipher(kpa_target, kp) for quote in ["'", '"']: - key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, - "encryption key", default=None) + key = self._search_regex( + r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), + js, 'encryption key', default=None) if key is not None: break else: @@ -153,23 +150,37 @@ class MixcloudIE(InfoExtractor): else: title = info_json['name'] - thumbnail = try_get(info_json, - lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + thumbnail = urljoin( + 'https://thumbnailer.mixcloud.com/unsafe/600x600/', + try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str)) uploader = try_get(info_json, lambda x: x['owner']['displayName']) uploader_id = try_get(info_json, lambda x: x['owner']['username']) description = try_get(info_json, lambda x: x['description']) - view_count = try_get(info_json, lambda x: x['plays']) + view_count = int_or_none(try_get(info_json, lambda x: x['plays'])) stream_info = info_json['streamInfo'] formats = [] - self._decrypt_and_extend(stream_info, 'url', lambda x: [{ - 'format_id': 'normal', - 'url': x - }], key, formats) - self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, - formats) - self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, - formats) + + for url_key in ('url', 'hlsUrl', 'dashUrl'): + format_url = stream_info.get(url_key) + if not format_url: + continue + decrypted = self._decrypt_xor_cipher(key, base64.b64decode(format_url)) + if not decrypted: + continue + if url_key == 'hlsUrl': + formats.extend(self._extract_m3u8_formats( + decrypted, track_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif url_key == 'dashUrl': + formats.extend(self._extract_mpd_formats( + decrypted, track_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 'url': decrypted, + }) + self._sort_formats(formats) return { 'id': track_id, From 9ce1ac404648142139f8b231c674d434ad4f9ffe Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Fri, 22 Sep 2017 22:49:48 +0000 Subject: [PATCH 071/337] [generic] Fix support for multiple HTML5 videos on one page (closes #14080) --- youtube_dl/extractor/generic.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b83c18380..7d0edf09c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1879,6 +1879,15 @@ class GenericIE(InfoExtractor): 'title': 'Building A Business Online: Principal Chairs Q & A', }, }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2849,13 +2858,20 @@ class GenericIE(InfoExtractor): # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: - for entry in entries: - entry.update({ + if len(entries) == 1: + entries[0].update({ 'id': video_id, 'title': video_title, }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: self._sort_formats(entry['formats']) - return self.playlist_result(entries) + return self.playlist_result(entries, video_id, video_title) jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) From 13de91c9e92bd831fee38fddbdabce7f6e82ef91 Mon Sep 17 00:00:00 2001 From: Dan Weber Date: Tue, 12 Sep 2017 22:52:54 -0400 Subject: [PATCH 072/337] [americastestkitchen] Add extractor (closes #10764) --- youtube_dl/extractor/americastestkitchen.py | 85 +++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 86 insertions(+) create mode 100755 youtube_dl/extractor/americastestkitchen.py diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py new file mode 100755 index 000000000..f231e7f6e --- /dev/null +++ b/youtube_dl/extractor/americastestkitchen.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/episode/(?P\d+)' + _TESTS = [{ + 'url': + 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '1_5g5zua6e', + 'title': 'atk_s17_e24.mp4', + 'ext': 'mp4', + 'description': '

Host Julia Collin Davison goes into the test kitchen with test cook Dan Souza to learn how to make the ultimate Grill-Roasted Beef Tenderloin. Next, equipment expert Adam Ried reviews gas grills in the Equipment Corner. Then, gadget guru Lisa McManus uncovers the best quirky gadgets. Finally, test cook Erin McMurrer shows host Bridget Lancaster how to make an elegant Pear-Walnut Upside-Down Cake.

', + 'timestamp': 1497285541, + 'upload_date': '20170612', + 'uploader_id': 'roger.metcalf@americastestkitchen.com', + 'release_date': '2017-06-17', + 'thumbnail': 'http://d3cizcpymoenau.cloudfront.net/images/35973/e24-tenderloin-16.jpg', + 'episode_number': 24, + 'episode': 'Summer Dinner Party', + 'episode_id': '548-summer-dinner-party', + 'season_number': 17 + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': + 'https://www.americastestkitchen.com/episode/546-a-spanish-affair', + 'only_matching': + True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + partner_id = self._search_regex( + r'partner_id/(?P\d+)', + webpage, + 'partner_id', + group='partner_id') + + video_data = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?});\s*', + webpage, 'initial context'), + video_id) + + episode_data = video_data['episodeDetail']['content']['data'] + episode_content_meta = episode_data['full_video'] + external_id = episode_content_meta['external_id'] + + # photo data + photo_data = episode_content_meta.get('photo') + thumbnail = photo_data.get('image_url') if photo_data else None + + # meta + release_date = episode_data.get('aired_at') + description = episode_content_meta.get('description') + episode_number = int(episode_content_meta.get('episode_number')) + episode = episode_content_meta.get('title') + episode_id = episode_content_meta.get('episode_slug') + season_number = int(episode_content_meta.get('season_number')) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, external_id), + 'ie_key': 'Kaltura', + 'id': video_id, + 'release_date': release_date, + 'thumbnail': thumbnail, + 'description': description, + 'episode_number': episode_number, + 'episode': episode, + 'episode_id': episode_id, + 'season_number': season_number + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab95c8575..585300500 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -39,6 +39,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE +from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anvato import AnvatoIE From 4bb58fa118a8c75b2ecf05f7b29a0ae27eef6239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 06:28:46 +0700 Subject: [PATCH 073/337] [americastestkitchen] Improve (closes #13996) --- youtube_dl/extractor/americastestkitchen.py | 82 ++++++++++----------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index f231e7f6e..01736872d 100755 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,85 +1,85 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + try_get, + unified_strdate, +) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/episode/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' _TESTS = [{ - 'url': - 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '1_5g5zua6e', - 'title': 'atk_s17_e24.mp4', + 'title': 'Summer Dinner Party', 'ext': 'mp4', - 'description': '

Host Julia Collin Davison goes into the test kitchen with test cook Dan Souza to learn how to make the ultimate Grill-Roasted Beef Tenderloin. Next, equipment expert Adam Ried reviews gas grills in the Equipment Corner. Then, gadget guru Lisa McManus uncovers the best quirky gadgets. Finally, test cook Erin McMurrer shows host Bridget Lancaster how to make an elegant Pear-Walnut Upside-Down Cake.

', + 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1497285541, 'upload_date': '20170612', 'uploader_id': 'roger.metcalf@americastestkitchen.com', - 'release_date': '2017-06-17', - 'thumbnail': 'http://d3cizcpymoenau.cloudfront.net/images/35973/e24-tenderloin-16.jpg', - 'episode_number': 24, + 'release_date': '20170617', + 'series': "America's Test Kitchen", + 'season_number': 17, 'episode': 'Summer Dinner Party', - 'episode_id': '548-summer-dinner-party', - 'season_number': 17 + 'episode_number': 24, }, 'params': { - # m3u8 download 'skip_download': True, }, }, { - 'url': - 'https://www.americastestkitchen.com/episode/546-a-spanish-affair', - 'only_matching': - True, + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) partner_id = self._search_regex( - r'partner_id/(?P\d+)', - webpage, - 'partner_id', - group='partner_id') + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') video_data = self._parse_json( self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?});\s*', + r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', webpage, 'initial context'), video_id) - episode_data = video_data['episodeDetail']['content']['data'] - episode_content_meta = episode_data['full_video'] - external_id = episode_content_meta['external_id'] + ep_data = try_get( + video_data, + (lambda x: x['episodeDetail']['content']['data'], + lambda x: x['videoDetail']['content']['data']), dict) + ep_meta = ep_data.get('full_video', {}) + external_id = ep_data.get('external_id') or ep_meta['external_id'] - # photo data - photo_data = episode_content_meta.get('photo') - thumbnail = photo_data.get('image_url') if photo_data else None + title = ep_data.get('title') or ep_meta.get('title') + description = clean_html(ep_meta.get('episode_description') or ep_data.get( + 'description') or ep_meta.get('description')) + thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) + release_date = unified_strdate(ep_data.get('aired_at')) - # meta - release_date = episode_data.get('aired_at') - description = episode_content_meta.get('description') - episode_number = int(episode_content_meta.get('episode_number')) - episode = episode_content_meta.get('title') - episode_id = episode_content_meta.get('episode_slug') - season_number = int(episode_content_meta.get('season_number')) + season_number = int_or_none(ep_meta.get('season_number')) + episode = ep_meta.get('title') + episode_number = int_or_none(ep_meta.get('episode_number')) return { '_type': 'url_transparent', 'url': 'kaltura:%s:%s' % (partner_id, external_id), 'ie_key': 'Kaltura', - 'id': video_id, - 'release_date': release_date, - 'thumbnail': thumbnail, + 'title': title, 'description': description, - 'episode_number': episode_number, + 'thumbnail': thumbnail, + 'release_date': release_date, + 'series': "America's Test Kitchen", + 'season_number': season_number, 'episode': episode, - 'episode_id': episode_id, - 'season_number': season_number + 'episode_number': episode_number, } From 5c1452e8f1e744db14be1baef840e9f531e8f144 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano Date: Sat, 23 Sep 2017 01:38:09 +0200 Subject: [PATCH 074/337] [twitter] Add support for user_id-less URLs (closes #14270) --- youtube_dl/extractor/twitter.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 7399cf538..0df3ad7c7 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -242,8 +242,9 @@ class TwitterCardIE(TwitterBaseIE): class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P[^/]+))/status/(?P\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' + _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -322,9 +323,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'FilmDrunk - Vine of the day', - 'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'FilmDrunk', + 'title': 'Vince Mancini - Vine of the day', + 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', + 'uploader': 'Vince Mancini', 'uploader_id': 'Filmdrunk', 'timestamp': 1402826626, 'upload_date': '20140615', @@ -372,6 +373,21 @@ class TwitterIE(InfoExtractor): 'params': { 'format': 'best[format_id^=http-]', }, + }, { + 'url': 'https://twitter.com/i/web/status/910031516746514432', + 'info_dict': { + 'id': '910031516746514432', + 'ext': 'mp4', + 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'uploader': 'Préfet de Guadeloupe', + 'uploader_id': 'Prefet971', + 'duration': 47.48, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -380,11 +396,15 @@ class TwitterIE(InfoExtractor): twid = mobj.group('id') webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_URL % (user_id, twid), twid) + self._TEMPLATE_STATUSES_URL % twid, twid) if 'twitter.com/account/suspended' in urlh.geturl(): raise ExtractorError('Account suspended by Twitter.', expected=True) + if user_id is None: + mobj = re.match(self._VALID_URL, urlh.geturl()) + user_id = mobj.group('user_id') + username = remove_end(self._og_search_title(webpage), ' on Twitter') title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') From 1c22d7a7f30917abfd2b7495f7bd02d51cb8528a Mon Sep 17 00:00:00 2001 From: Namnamseo <0201ssw+github@gmail.com> Date: Thu, 24 Aug 2017 11:32:24 +0900 Subject: [PATCH 075/337] [kakao] Add extractor (closes #12298) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kakao.py | 140 +++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 youtube_dl/extractor/kakao.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 585300500..4232a4fef 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -483,6 +483,7 @@ from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE +from .kakao import KakaoIE from .kaltura import KalturaIE from .kamcord import KamcordIE from .kanalplay import KanalPlayIE diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py new file mode 100644 index 000000000..0caa41e9e --- /dev/null +++ b/youtube_dl/extractor/kakao.py @@ -0,0 +1,140 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + compat_str, + unified_timestamp, +) + + +class KakaoIE(InfoExtractor): + _VALID_URL = r'https?://tv.kakao.com/channel/(?P\d+)/cliplink/(?P\d+)' + IE_NAME = 'kakao.com' + + _TESTS = [{ + 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', + 'md5': '702b2fbdeb51ad82f5c904e8c0766340', + 'info_dict': { + 'id': '301965083', + 'ext': 'mp4', + 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', + 'uploader_id': 2671005, + 'uploader': '그랑그랑이', + 'timestamp': 1488160199, + 'upload_date': '20170227', + } + }, { + 'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': '300103180', + 'ext': 'mp4', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'uploader_id': 2653210, + 'uploader': '쇼 음악중심', + 'timestamp': 1485684628, + 'upload_date': '20170129', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_url = 'http://tv.kakao.com/embed/player/cliplink/' + video_id + \ + '?service=kakao_tv&autoplay=1&profile=HIGH&wmode=transparent' + player_header = {'Referer': player_url} + + impress = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/impress' % video_id, + video_id, 'Downloading video info', + query={ + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + 'fields': 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' + }, headers=player_header) + + clipLink = impress['clipLink'] + clip = clipLink['clip'] + + video_info = { + 'id': video_id, + 'title': clip['title'], + 'description': clip.get('description'), + 'uploader': clipLink.get('channel', {}).get('name'), + 'uploader_id': clipLink.get('channelId'), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + } + + tid = impress.get('tid', '') + raw = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw' % video_id, + video_id, 'Downloading video formats info', + query={ + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': 'HIGH', + 'dteType': 'PC', + }, headers=player_header, fatal=False) + + formats = [] + for fmt in raw.get('outputList', []): + try: + profile_name = fmt['profile'] + fmt_url_json = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw/videolocation' % video_id, + video_id, 'Downloading video URL for profile %s' % profile_name, + query={ + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': profile_name + }, headers=player_header, fatal=False) + + if fmt_url_json is None: + continue + + fmt_url = fmt_url_json['url'] + formats.append({ + 'url': fmt_url, + 'format_id': profile_name, + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + 'format_note': fmt.get('label'), + 'filesize': int_or_none(fmt.get('filesize')) + }) + except KeyError: + pass + + self._sort_formats(formats) + video_info['formats'] = formats + + top_thumbnail = clip.get('thumbnailUrl') + thumbs = [] + for thumb in clip.get('clipChapterThumbnailList', []): + thumbs.append({ + 'url': thumb.get('thumbnailUrl'), + 'id': compat_str(thumb.get('timeInSec')), + 'preference': -1 if thumb.get('isDefault') else 0 + }) + video_info['thumbnail'] = top_thumbnail + video_info['thumbnails'] = thumbs + + upload_date = unified_timestamp(clipLink.get('createTime')) + video_info['timestamp'] = upload_date + + return video_info From f70ddd4aebbfb0bdf2f63c1eba5b5614d2cfb70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:25:15 +0700 Subject: [PATCH 076/337] [kakao] Improve (closes #14007) --- youtube_dl/extractor/kakao.py | 117 ++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 0caa41e9e..c9b438efc 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,16 +3,17 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, - compat_str, unified_timestamp, + update_url_query, ) class KakaoIE(InfoExtractor): _VALID_URL = r'https?://tv.kakao.com/channel/(?P\d+)/cliplink/(?P\d+)' - IE_NAME = 'kakao.com' + _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks' _TESTS = [{ 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', @@ -44,60 +45,57 @@ class KakaoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - player_url = 'http://tv.kakao.com/embed/player/cliplink/' + video_id + \ - '?service=kakao_tv&autoplay=1&profile=HIGH&wmode=transparent' - player_header = {'Referer': player_url} - - impress = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/impress' % video_id, - video_id, 'Downloading video info', - query={ - 'player': 'monet_html5', - 'referer': url, - 'uuid': '', - 'service': 'kakao_tv', - 'section': '', - 'dteType': 'PC', - 'fields': 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' - }, headers=player_header) - - clipLink = impress['clipLink'] - clip = clipLink['clip'] - - video_info = { - 'id': video_id, - 'title': clip['title'], - 'description': clip.get('description'), - 'uploader': clipLink.get('channel', {}).get('name'), - 'uploader_id': clipLink.get('channelId'), - 'duration': int_or_none(clip.get('duration')), - 'view_count': int_or_none(clip.get('playCount')), - 'like_count': int_or_none(clip.get('likeCount')), - 'comment_count': int_or_none(clip.get('commentCount')), + player_header = { + 'Referer': update_url_query( + 'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, { + 'service': 'kakao_tv', + 'autoplay': '1', + 'profile': 'HIGH', + 'wmode': 'transparent', + }) } + QUERY_COMMON = { + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + } + + query = QUERY_COMMON.copy() + query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' + impress = self._download_json( + '%s/%s/impress' % (self._API_BASE, video_id), + video_id, 'Downloading video info', + query=query, headers=player_header) + + clip_link = impress['clipLink'] + clip = clip_link['clip'] + + title = clip.get('title') or clip_link.get('displayTitle') + tid = impress.get('tid', '') + + query = QUERY_COMMON.copy() + query.update({ + 'tid': tid, + 'profile': 'HIGH', + }) raw = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw' % video_id, + '%s/%s/raw' % (self._API_BASE, video_id), video_id, 'Downloading video formats info', - query={ - 'player': 'monet_html5', - 'referer': url, - 'uuid': '', - 'service': 'kakao_tv', - 'section': '', - 'tid': tid, - 'profile': 'HIGH', - 'dteType': 'PC', - }, headers=player_header, fatal=False) + query=query, headers=player_header) formats = [] for fmt in raw.get('outputList', []): try: profile_name = fmt['profile'] fmt_url_json = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw/videolocation' % video_id, - video_id, 'Downloading video URL for profile %s' % profile_name, + '%s/%s/raw/videolocation' % (self._API_BASE, video_id), + video_id, + 'Downloading video URL for profile %s' % profile_name, query={ 'service': 'kakao_tv', 'section': '', @@ -119,11 +117,8 @@ class KakaoIE(InfoExtractor): }) except KeyError: pass - self._sort_formats(formats) - video_info['formats'] = formats - top_thumbnail = clip.get('thumbnailUrl') thumbs = [] for thumb in clip.get('clipChapterThumbnailList', []): thumbs.append({ @@ -131,10 +126,24 @@ class KakaoIE(InfoExtractor): 'id': compat_str(thumb.get('timeInSec')), 'preference': -1 if thumb.get('isDefault') else 0 }) - video_info['thumbnail'] = top_thumbnail - video_info['thumbnails'] = thumbs + top_thumbnail = clip.get('thumbnailUrl') + if top_thumbnail: + thumbs.append({ + 'url': top_thumbnail, + 'preference': 10, + }) - upload_date = unified_timestamp(clipLink.get('createTime')) - video_info['timestamp'] = upload_date - - return video_info + return { + 'id': video_id, + 'title': title, + 'description': clip.get('description'), + 'uploader': clip_link.get('channel', {}).get('name'), + 'uploader_id': clip_link.get('channelId'), + 'thumbnails': thumbs, + 'timestamp': unified_timestamp(clip_link.get('createTime')), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + 'formats': formats, + } From 7f4921b38d10c17fe354bab20b741b362c5ae0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:26:40 +0700 Subject: [PATCH 077/337] [heise] PEP 8 --- youtube_dl/extractor/heise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 495ffb7dc..82e11a7d8 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -60,8 +60,8 @@ class HeiseIE(InfoExtractor): title = self._html_search_meta('fulltitle', webpage, default=None) if not title or title == "c't": title = self._search_regex( - r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') + r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') yt_urls = YoutubeIE._extract_urls(webpage) if yt_urls: From 136507b39a2b48cb775249e9724eeeedb56baed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:41:22 +0700 Subject: [PATCH 078/337] [24video] Add support for 24video.adult (closes #14295) --- youtube_dl/extractor/twentyfourvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 7af11659f..cc51ca0ae 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', From e3440d824a7326d0ba609d8f0896203208ecc558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:42:17 +0700 Subject: [PATCH 079/337] [24video] Fix timestamp extraction and make non fatal (#14295) --- youtube_dl/extractor/twentyfourvideo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index cc51ca0ae..96e0b96e3 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -60,8 +60,8 @@ class TwentyFourVideoIE(InfoExtractor): duration = int_or_none(self._og_search_property( 'duration', webpage, 'duration', fatal=False)) timestamp = parse_iso8601(self._search_regex( - r'