From dd867805969126ed0bd2ab8fe69eaf61fbf44ab7 Mon Sep 17 00:00:00 2001
From: remitamine
Date: Thu, 11 Feb 2016 10:55:50 +0100
Subject: [PATCH 001/508] [extractor/common] fix dash formats sorting

---
 youtube_dl/extractor/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 00645feed..cd7087bec 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -851,6 +851,7 @@ class InfoExtractor(object):
             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
 
             if f.get('vcodec') == 'none':  # audio only
+                preference -= 50
                 if self._downloader.params.get('prefer_free_formats'):
                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                 else:
@@ -861,6 +862,8 @@ class InfoExtractor(object):
                 except ValueError:
                     audio_ext_preference = -1
             else:
+                if f.get('acodec') == 'none':  # video only
+                    preference -= 40
                 if self._downloader.params.get('prefer_free_formats'):
                     ORDER = ['flv', 'mp4', 'webm']
                 else:

From 199e72429106375218902102812e26c2fc6624b5 Mon Sep 17 00:00:00 2001
From: mutantmonkey
Date: Mon, 15 Feb 2016 17:30:53 -0800
Subject: [PATCH 002/508] [KUSI] Add new extractor

---
 youtube_dl/extractor/__init__.py |  1 +
 youtube_dl/extractor/kusi.py     | 61 ++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 youtube_dl/extractor/kusi.py

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 537d25777..bfc2008be 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -338,6 +338,7 @@ from .konserthusetplay import KonserthusetPlayIE
 from .kontrtube import KontrTubeIE
 from .krasview import KrasViewIE
 from .ku6 import Ku6IE
+from .kusi import KUSIIE
 from .kuwo import (
     KuwoIE,
     KuwoAlbumIE,
diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py
new file mode 100644
index 000000000..20407411b
--- /dev/null
+++ b/youtube_dl/extractor/kusi.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import int_or_none
+
+
+class KUSIIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
+    _TEST = {
+        'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold',
+        'md5': 'f926e7684294cf8cb7bdf8858e1b3988',
+        'info_dict': {
+            'id': '12203019',
+            'ext': 'mp4',
+            'title': 'Turko Files: Case Closed! 
& Put On Hold!', + 'duration': 231000, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + if mobj.group('clipId') is not None: + video_id = mobj.group('clipId') + else: + webpage = self._download_webpage(url, mobj.group('path')) + video_id = self._html_search_regex(r'"clipId", "(\d+)"', webpage, + 'clipId') + + xml_url = 'http://www.kusi.com/build.asp?buildtype=buildfeaturexml'\ + 'request&featureType=Clip&featureid={0}&affiliateno=956&'\ + 'clientgroupid=1&rnd=562461'.format(video_id) + doc = self._download_xml(xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + video_title = doc.find('HEADLINE').text + duration = int_or_none(doc.find('DURATION'), get_attr='text') + description = doc.find('ABSTRACT') + + quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') + formats = [] + for quality in quality_options: + if 'height' in quality.attrib: + formats.append({ + 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), + 'height': quality.attrib['height'], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'description': description, + 'duration': duration, + 'formats': formats, + } From 12b84ac8c13754baeeead907d8c9d239141f8706 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 19 Feb 2016 19:29:24 +0100 Subject: [PATCH 003/508] [downloader/external] Add FFmpegFD(fixes #622) - replace HlsFD and RtspFD - add basic support for downloading part of the video or audio --- youtube_dl/downloader/__init__.py | 20 +++++---- youtube_dl/downloader/external.py | 65 +++++++++++++++++++++++++++ youtube_dl/downloader/hls.py | 74 ++----------------------------- youtube_dl/downloader/rtsp.py | 45 ------------------- 4 files changed, 80 insertions(+), 124 deletions(-) delete mode 100644 youtube_dl/downloader/rtsp.py diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index dccc59212..bb6afb1f8 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,14 +1,15 @@ from __future__ import unicode_literals from .common import FileDownloader -from .external import get_external_downloader from .f4m import F4mFD from .hls import HlsFD -from .hls import NativeHlsFD from .http import HttpFD -from .rtsp import RtspFD from .rtmp import RtmpFD from .dash import DashSegmentsFD +from .external import ( + get_external_downloader, + FFmpegFD, +) from ..utils import ( determine_protocol, @@ -16,10 +17,10 @@ from ..utils import ( PROTOCOL_MAP = { 'rtmp': RtmpFD, - 'm3u8_native': NativeHlsFD, - 'm3u8': HlsFD, - 'mms': RtspFD, - 'rtsp': RtspFD, + 'm3u8_native': HlsFD, + 'm3u8': FFmpegFD, + 'mms': FFmpegFD, + 'rtsp': FFmpegFD, 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, } @@ -30,6 +31,9 @@ def get_suitable_downloader(info_dict, params={}): protocol = determine_protocol(info_dict) info_dict['protocol'] = protocol + if (info_dict.get('start_time') or info_dict.get('end_time')) and FFmpegFD.supports(info_dict): + return FFmpegFD + external_downloader = params.get('external_downloader') if external_downloader is not None: ed = get_external_downloader(external_downloader) @@ -37,7 +41,7 @@ def get_suitable_downloader(info_dict, params={}): return ed if protocol == 'm3u8' and params.get('hls_prefer_native'): - return NativeHlsFD + return HlsFD return PROTOCOL_MAP.get(protocol, HttpFD) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 
2bc011266..bb43677b7 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -2,8 +2,12 @@ from __future__ import unicode_literals import os.path import subprocess +import sys +import re from .common import FileDownloader +from ..postprocessor.ffmpeg import FFmpegPostProcessor +from ..compat import compat_str from ..utils import ( cli_option, cli_valueless_option, @@ -11,6 +15,7 @@ from ..utils import ( cli_configuration_args, encodeFilename, encodeArgument, + handle_youtubedl_headers, ) @@ -136,6 +141,66 @@ class HttpieFD(ExternalFD): cmd += ['%s:%s' % (key, val)] return cmd + +class FFmpegFD(ExternalFD): + @classmethod + def supports(cls, info_dict): + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + + def _call_downloader(self, tmpfilename, info_dict): + url = info_dict['url'] + ffpp = FFmpegPostProcessor(downloader=self) + ffpp.check_version() + + args = [ffpp.executable, '-y'] + + start_time = info_dict.get('start_time', 0) + if start_time: + args += ['-ss', compat_str(start_time)] + end_time = info_dict.get('end_time') + if end_time: + args += ['-t', compat_str(end_time - start_time)] + + if info_dict['http_headers'] and re.match(r'^https?://', url): + # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: + # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) + args += [ + '-headers', + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + + args += ['-i', url, '-c', 'copy'] + if info_dict.get('protocol') == 'm3u8': + if self.params.get('hls_use_mpegts', False): + args += ['-f', 'mpegts'] + else: + args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + else: + args += ['-f', info_dict['ext']] + + args = [encodeArgument(opt) for opt in args] + args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + + self._debug_cmd(args) + + proc = subprocess.Popen(args, stdin=subprocess.PIPE) + try: + retval = proc.wait() + except KeyboardInterrupt: + # subprocces.run would send the SIGKILL signal to ffmpeg and the + # mp4 file couldn't be played, but if we ask ffmpeg to quit it + # produces a file that is playable (this is mostly useful for live + # streams). Note that Windows is not affected and produces playable + # files (see https://github.com/rg3/youtube-dl/issues/8300). + if sys.platform != 'win32': + proc.communicate(b'q') + raise + return retval + + +class AVconvFD(FFmpegFD): + pass + _BY_NAME = dict( (klass.get_basename(), klass) for name, klass in globals().items() diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 2a775bf00..a01dac031 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -1,87 +1,19 @@ from __future__ import unicode_literals -import os +import os.path import re -import subprocess -import sys -from .common import FileDownloader from .fragment import FragmentFD from ..compat import compat_urlparse -from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( - encodeArgument, encodeFilename, sanitize_open, - handle_youtubedl_headers, ) -class HlsFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - ffpp = FFmpegPostProcessor(downloader=self) - if not ffpp.available: - self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. 
Please install one.') - return False - ffpp.check_version() - - args = [ffpp.executable, '-y'] - - if info_dict['http_headers'] and re.match(r'^https?://', url): - # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: - # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. - headers = handle_youtubedl_headers(info_dict['http_headers']) - args += [ - '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - - args += ['-i', url, '-c', 'copy'] - if self.params.get('hls_use_mpegts', False): - args += ['-f', 'mpegts'] - else: - args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - - args = [encodeArgument(opt) for opt in args] - args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) - - self._debug_cmd(args) - - proc = subprocess.Popen(args, stdin=subprocess.PIPE) - try: - retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the - # mp4 file couldn't be played, but if we ask ffmpeg to quit it - # produces a file that is playable (this is mostly useful for live - # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/rg3/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') - raise - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % (ffpp.basename, retval)) - return False - - -class NativeHlsFD(FragmentFD): - """ A more limited implementation that does not require ffmpeg """ +class HlsFD(FragmentFD): + """ A limited implementation that does not require ffmpeg """ FD_NAME = 'hlsnative' diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py deleted file mode 100644 index 3eb29526c..000000000 --- a/youtube_dl/downloader/rtsp.py +++ /dev/null @@ -1,45 +0,0 @@ -from __future__ import unicode_literals - -import os -import subprocess - -from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) - - -class RtspFD(FileDownloader): - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) - - if check_executable('mplayer', ['-h']): - args = [ - 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', - '-dumpstream', '-dumpfile', tmpfilename, url] - elif check_executable('mpv', ['-h']): - args = [ - 'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url] - else: - self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. 
Please install any.') - return False - - retval = subprocess.call(args) - if retval == 0: - fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) - self.try_rename(tmpfilename, filename) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - }) - return True - else: - self.to_stderr('\n') - self.report_error('%s exited with code %d' % (args[0], retval)) - return False From 99cbe98ce8617c119c2fb6a567b0e6ef7eae8859 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 20 Feb 2016 07:58:25 +0100 Subject: [PATCH 004/508] [downloader/external] check for external downloaders availability --- youtube_dl/downloader/__init__.py | 4 ++-- youtube_dl/downloader/external.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index bb6afb1f8..67c2840a5 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -31,13 +31,13 @@ def get_suitable_downloader(info_dict, params={}): protocol = determine_protocol(info_dict) info_dict['protocol'] = protocol - if (info_dict.get('start_time') or info_dict.get('end_time')) and FFmpegFD.supports(info_dict): + if (info_dict.get('start_time') or info_dict.get('end_time')) and FFmpegFD.available() and FFmpegFD.supports(info_dict): return FFmpegFD external_downloader = params.get('external_downloader') if external_downloader is not None: ed = get_external_downloader(external_downloader) - if ed.supports(info_dict): + if ed.available() and ed.supports(info_dict): return ed if protocol == 'm3u8' and params.get('hls_prefer_native'): diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index bb43677b7..edf85483b 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -16,6 +16,7 @@ from ..utils import ( encodeFilename, encodeArgument, handle_youtubedl_headers, + check_executable, ) @@ -50,6 +51,10 @@ class ExternalFD(FileDownloader): def exe(self): return self.params.get('external_downloader') + @classmethod + def available(cls): + return check_executable(cls.get_basename(), cls.available_opt) + @classmethod def supports(cls, info_dict): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') @@ -81,6 +86,8 @@ class ExternalFD(FileDownloader): class CurlFD(ExternalFD): + available_opt = ['-V'] + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): @@ -94,6 +101,8 @@ class CurlFD(ExternalFD): class AxelFD(ExternalFD): + available_opt = ['-V'] + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): @@ -104,6 +113,8 @@ class AxelFD(ExternalFD): class WgetFD(ExternalFD): + available_opt = ['--version'] + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): @@ -117,6 +128,8 @@ class WgetFD(ExternalFD): class Aria2cFD(ExternalFD): + available_opt = ['-v'] + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c'] cmd += self._configuration_args([ @@ -135,6 +148,10 @@ class Aria2cFD(ExternalFD): class HttpieFD(ExternalFD): + @classmethod + def available(cls): + return check_executable('http', ['--version']) + def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', 
'--download', '--output', tmpfilename, info_dict['url']] for key, val in info_dict['http_headers'].items(): @@ -147,6 +164,10 @@ class FFmpegFD(ExternalFD): def supports(cls, info_dict): return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + @classmethod + def available(cls): + return FFmpegPostProcessor().available + def _call_downloader(self, tmpfilename, info_dict): url = info_dict['url'] ffpp = FFmpegPostProcessor(downloader=self) From f34294fa0c0097cea7f6388d5d691d5a54950491 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 20 Feb 2016 08:06:12 +0100 Subject: [PATCH 005/508] [downloader/external:ffmpegfd] check for None value of start_time --- youtube_dl/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index edf85483b..a4fdf1af8 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -175,7 +175,7 @@ class FFmpegFD(ExternalFD): args = [ffpp.executable, '-y'] - start_time = info_dict.get('start_time', 0) + start_time = info_dict.get('start_time') or 0 if start_time: args += ['-ss', compat_str(start_time)] end_time = info_dict.get('end_time') From 399a76e67bca0beb4849ea90c4f40803fbd06ed3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Feb 2016 03:28:25 +0800 Subject: [PATCH 006/508] [utils] Jython support: tolerate missing fcntl module --- youtube_dl/utils.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a2c6780ca..17747be26 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1217,13 +1217,23 @@ if sys.platform == 'win32': raise OSError('Unlocking file failed: %r' % ctypes.FormatError()) else: - import fcntl + # Some platforms, such as Jython, is missing fcntl + try: + import fcntl - def _lock_file(f, exclusive): - fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) + def _lock_file(f, exclusive): + fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) - def _unlock_file(f): - fcntl.flock(f, fcntl.LOCK_UN) + def _unlock_file(f): + fcntl.flock(f, fcntl.LOCK_UN) + except ImportError: + UNSUPPORTED_MSG = 'file locking is not supported on this platform' + + def _lock_file(f, exclusive): + raise IOError(UNSUPPORTED_MSG) + + def _unlock_file(f): + raise IOError(UNSUPPORTED_MSG) class locked_file(object): From c1c05c67ea6087c3b0190c9f16cb9fdd8160e398 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Feb 2016 03:29:02 +0800 Subject: [PATCH 007/508] [utils] Jython support - disable setproctitle() until ctypes is complete --- youtube_dl/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 17747be26..16b4324a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1397,6 +1397,12 @@ def fix_xml_ampersands(xml_str): def setproctitle(title): assert isinstance(title, compat_str) + + # ctypes in Jython is not complete + # http://bugs.jython.org/issue2148 + if sys.platform.startswith('java'): + return + try: libc = ctypes.cdll.LoadLibrary('libc.so.6') except OSError: From 101067de12e193c8ad42e1f474c9018eedadec9d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Feb 2016 03:29:49 +0800 Subject: [PATCH 008/508] Jython support - handle *.class files --- .gitignore | 3 ++- Makefile | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0422adf44..26dbde73d 100644 --- 
a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.pyc
 *.pyo
+*.class
 *~
 *.DS_Store
 wine-py2exe/
@@ -32,4 +33,4 @@ test/testdata
 .tox
 youtube-dl.zsh
 .idea
-.idea/*
\ No newline at end of file
+.idea/*
diff --git a/Makefile b/Makefile
index cb449b7e6..afd7f4032 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas
 clean:
 	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
 	find . -name "*.pyc" -delete
+	find . -name "*.class" -delete
 
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin

From 1b77ee6248b2a9967d3d3e809e416bd26fd7342c Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan
Date: Wed, 24 Feb 2016 03:32:12 +0800
Subject: [PATCH 009/508] [c56] Support videos hosted on Sohu (closes #8073)

---
 youtube_dl/extractor/c56.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index cb96c3876..cac8fdcba 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -4,12 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import js_to_json
 
 
 class C56IE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
     IE_NAME = '56.com'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
         'md5': 'e59995ac63d0457783ea05f93f12a866',
         'info_dict': {
@@ -18,12 +19,29 @@ class C56IE(InfoExtractor):
             'title': '网事知多少 第32期:车怒',
             'duration': 283.813,
         },
-    }
+    }, {
+        'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
+        'md5': '',
+        'info_dict': {
+            'id': '82247482',
+            'title': '爱的诅咒之杜鹃花开',
+        },
+        'playlist_count': 7,
+        'add_ie': ['Sohu'],
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
         text_id = mobj.group('textid')
+
+        webpage = self._download_webpage(url, text_id)
+        sohu_video_info_str = self._search_regex(
+            r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
+        if sohu_video_info_str:
+            sohu_video_info = self._parse_json(
+                sohu_video_info_str, text_id, transform_source=js_to_json)
+            return self.url_result(sohu_video_info['url'], 'Sohu')
+
         page = self._download_json(
             'http://vxml.56.com/json/%s/' % text_id, text_id,
             'Downloading video info')

From c24883a1c0e650a41b54e7cdec8427b046c27bc1 Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan
Date: Wed, 24 Feb 2016 03:43:24 +0800
Subject: [PATCH 010/508] [facebook] Fix format sorting

'hd' formats should have higher priorities

---
 youtube_dl/extractor/facebook.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index d2ff38140..6c6c3b1bd 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -212,10 +212,13 @@ class FacebookIE(InfoExtractor):
                 for src_type in ('src', 'src_no_ratelimit'):
                     src = f[0].get('%s_%s' % (quality, src_type))
                     if src:
+                        preference = -10 if format_id == 'progressive' else 0
+                        if quality == 'hd':
+                            preference += 5
                         formats.append({
                             'format_id': '%s_%s_%s' % (format_id, quality, src_type),
                             'url': src,
-                            'preference': -10 if format_id == 'progressive' else 0,
+                            'preference': preference,
                         })
             dash_manifest = f[0].get('dash_manifest')
             if 
dash_manifest: From c0da50d2b2ba75e5d497caa8805ebac49845f168 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Wed, 24 Feb 2016 23:05:23 +0600 Subject: [PATCH 011/508] [README.md] Turn references to issues to links --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cf03e0d54..d50d4b950 100644 --- a/README.md +++ b/README.md @@ -599,7 +599,7 @@ You can merge the video and audio of two formats into a single file using `-f Date: Thu, 25 Feb 2016 00:36:14 +0600 Subject: [PATCH 012/508] [motherless] Make categories optional (Closes #8654) --- youtube_dl/extractor/motherless.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 97d5da626..b3bfcac9e 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -54,6 +54,11 @@ class MotherlessIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg', 'age_limit': 18, } + }, + { + # no keywords + 'url': 'http://motherless.com/8B4BBC1', + 'only_matching': True, } ] @@ -86,7 +91,7 @@ class MotherlessIE(InfoExtractor): r'"thumb-member-username">\s+ Date: Thu, 25 Feb 2016 00:42:19 +0600 Subject: [PATCH 013/508] [motherless] Detect non-existing videos --- youtube_dl/extractor/motherless.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index b3bfcac9e..30e686a4e 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, str_to_int, unified_strdate, ) @@ -66,6 +67,11 @@ class MotherlessIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(p in webpage for p in ( + '404 - MOTHERLESS.COM<', + ">The page you're looking for cannot be found.<")): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = self._html_search_regex( From f160785c5c5a99a2fdc4724f1f66f423cf8f6bf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Feb 2016 00:52:49 +0600 Subject: [PATCH 014/508] [utils] Remove AM/PM from unified_strdate patterns --- test/test_utils.py | 1 + youtube_dl/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e6887be9f..d0736f435 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -249,6 +249,7 @@ class TestUtil(unittest.TestCase): self.assertEqual( unified_strdate('2/2/2015 6:47:40 PM', day_first=False), '20150202') + self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214') self.assertEqual(unified_strdate('25-09-2014'), '20140925') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8ec70f93c..a3df90fad 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -905,9 +905,9 @@ def unified_strdate(date_str, day_first=True): '%d %b %Y', '%B %d %Y', '%b %d %Y', - '%b %dst %Y %I:%M%p', - '%b %dnd %Y %I:%M%p', - '%b %dth %Y %I:%M%p', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', '%Y %m %d', '%Y-%m-%d', '%Y/%m/%d', From d0459c530d3ce145fe89cb30eba813be6b8ed66a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Feb 2016 00:54:41 
+0600 Subject: [PATCH 015/508] [motherless] Update tests --- youtube_dl/extractor/motherless.py | 93 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 30e686a4e..0b4787c1d 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -13,55 +13,52 @@ from ..utils import ( class MotherlessIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' - _TESTS = [ - { - 'url': 'http://motherless.com/AC3FFE1', - 'md5': '310f62e325a9fafe64f68c0bccb6e75f', - 'info_dict': { - 'id': 'AC3FFE1', - 'ext': 'mp4', - 'title': 'Fucked in the ass while playing PS3', - 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], - 'upload_date': '20100913', - 'uploader_id': 'famouslyfuckedup', - 'thumbnail': 're:http://.*\.jpg', - 'age_limit': 18, - } - }, - { - 'url': 'http://motherless.com/532291B', - 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', - 'info_dict': { - 'id': '532291B', - 'ext': 'mp4', - 'title': 'Amazing girl playing the omegle game, PERFECT!', - 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'], - 'upload_date': '20140622', - 'uploader_id': 'Sulivana7x', - 'thumbnail': 're:http://.*\.jpg', - 'age_limit': 18, - } - }, - { - 'url': 'http://motherless.com/g/cosplay/633979F', - 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', - 'info_dict': { - 'id': '633979F', - 'ext': 'mp4', - 'title': 'Turtlette', - 'categories': ['superheroine heroine superher'], - 'upload_date': '20140827', - 'uploader_id': 'shade0230', - 'thumbnail': 're:http://.*\.jpg', - 'age_limit': 18, - } - }, - { - # no keywords - 'url': 'http://motherless.com/8B4BBC1', - 'only_matching': True, + _TESTS = [{ + 'url': 'http://motherless.com/AC3FFE1', + 'md5': '310f62e325a9fafe64f68c0bccb6e75f', + 'info_dict': { + 'id': 'AC3FFE1', + 'ext': 'mp4', + 'title': 'Fucked in the ass while playing PS3', + 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], + 'upload_date': '20100913', + 'uploader_id': 'famouslyfuckedup', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, } - ] + }, { + 'url': 'http://motherless.com/532291B', + 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131', + 'info_dict': { + 'id': '532291B', + 'ext': 'mp4', + 'title': 'Amazing girl playing the omegle game, PERFECT!', + 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', + 'game', 'hairy'], + 'upload_date': '20140622', + 'uploader_id': 'Sulivana7x', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + }, + 'skip': '404', + }, { + 'url': 'http://motherless.com/g/cosplay/633979F', + 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', + 'info_dict': { + 'id': '633979F', + 'ext': 'mp4', + 'title': 'Turtlette', + 'categories': ['superheroine heroine superher'], + 'upload_date': '20140827', + 'uploader_id': 'shade0230', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } + }, { + # no keywords + 'url': 'http://motherless.com/8B4BBC1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 9195ef745aba88a08e351af6405a87b5f355c026 Mon Sep 17 00:00:00 2001 From: mutantmonkey <mutantmonkey@mutantmonkey.in> Date: Mon, 15 Feb 2016 17:07:13 -0800 Subject: [PATCH 016/508] [uStudio] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ustudio.py | 58 ++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 
youtube_dl/extractor/ustudio.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1ae606f1e..5386747f1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -813,6 +813,7 @@ from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import UstudioIE from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py new file mode 100644 index 000000000..0f837ad73 --- /dev/null +++ b/youtube_dl/extractor/ustudio.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class UstudioIE(InfoExtractor): + IE_NAME = 'uStudio' + _VALID_URL = r'http://(?:www\.|v1\.)?ustudio.com/video/(?P<id>[\w\d]+)/.+' + _TESTS = [ + { + 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', + 'md5': '58bbfca62125378742df01fc2abbdef6', + 'info_dict': { + 'id': 'Uxu2my9bgSph', + 'ext': 'mp4', + 'title': 'San Francisco: Golden Gate Bridge', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:23925500697f2c6d4830e387ba51a9be', + 'uploader': 'Tony Farley', + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + doc = self._download_xml( + 'http://v1.ustudio.com/embed/{0}/ustudio/config.xml'.format( + video_id), + video_id, + note='Downloading video info', + errnote='Failed to download video info') + + formats = [ + { + 'url': quality.attrib['url'], + 'width': int_or_none(quality.attrib.get('width')), + 'height': int_or_none(quality.attrib.get('height')), + } for quality in doc.findall('./qualities/quality/video') + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + 'uploader': self._html_search_regex( + r'<a href=".*/user/.+">(.+)</a> on', + webpage, + 'uploader', + fatal=False), + } From 20108c6b9045f819580bf2d6ca687718bd7b0510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Feb 2016 21:30:19 +0600 Subject: [PATCH 017/508] [ustudio] Improve (Closes #8574) --- youtube_dl/extractor/ustudio.py | 91 ++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py index 0f837ad73..cafc082b6 100644 --- a/youtube_dl/extractor/ustudio.py +++ b/youtube_dl/extractor/ustudio.py @@ -1,58 +1,67 @@ -# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + unified_strdate, +) class UstudioIE(InfoExtractor): - IE_NAME = 'uStudio' - _VALID_URL = r'http://(?:www\.|v1\.)?ustudio.com/video/(?P<id>[\w\d]+)/.+' - _TESTS = [ - { - 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', - 'md5': '58bbfca62125378742df01fc2abbdef6', - 'info_dict': { - 'id': 'Uxu2my9bgSph', - 'ext': 'mp4', - 'title': 'San Francisco: Golden Gate Bridge', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:23925500697f2c6d4830e387ba51a9be', - 'uploader': 'Tony Farley', - } - }, - ] + _VALID_URL = 
r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' + _TEST = { + 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', + 'md5': '58bbfca62125378742df01fc2abbdef6', + 'info_dict': { + 'id': 'Uxu2my9bgSph', + 'display_id': 'san_francisco_golden_gate_bridge', + 'ext': 'mp4', + 'title': 'San Francisco: Golden Gate Bridge', + 'description': 'md5:23925500697f2c6d4830e387ba51a9be', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20111107', + 'uploader': 'Tony Farley', + } + } def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, video_id) + config = self._download_xml( + 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, + display_id) - doc = self._download_xml( - 'http://v1.ustudio.com/embed/{0}/ustudio/config.xml'.format( - video_id), - video_id, - note='Downloading video info', - errnote='Failed to download video info') + def extract(kind): + return [{ + 'url': item.attrib['url'], + 'width': int_or_none(item.get('width')), + 'height': int_or_none(item.get('height')), + } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] - formats = [ - { - 'url': quality.attrib['url'], - 'width': int_or_none(quality.attrib.get('width')), - 'height': int_or_none(quality.attrib.get('height')), - } for quality in doc.findall('./qualities/quality/video') - ] + formats = extract('video') self._sort_formats(formats) + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title(webpage) + upload_date = unified_strdate(self._search_regex( + r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>', + webpage, 'upload date', fatal=False)) + uploader = self._search_regex( + r'Uploaded by\s*<a[^>]*>([^<]+)<', + webpage, 'uploader', fatal=False) + return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': formats, + 'display_id': display_id, + 'title': title, 'description': self._og_search_description(webpage), - 'uploader': self._html_search_regex( - r'<a href=".*/user/.+">(.+)</a> on', - webpage, - 'uploader', - fatal=False), + 'thumbnails': extract('image'), + 'upload_date': upload_date, + 'uploader': uploader, + 'formats': formats, } From e26cde092736c3356c1ab76c899a180fb3441d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Feb 2016 21:46:43 +0600 Subject: [PATCH 018/508] [space] Remove extractor (Closes #8662) Now uses ooyala embed --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/space.py | 38 -------------------------------- 2 files changed, 39 deletions(-) delete mode 100644 youtube_dl/extractor/space.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5386747f1..77e2ae425 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -669,7 +669,6 @@ from .southpark import ( SouthParkEsIE, SouthParkNlIE ) -from .space import SpaceIE from .spankbang import SpankBangIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py deleted file mode 100644 index ebb5d6ec0..000000000 --- a/youtube_dl/extractor/space.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor 
-from .brightcove import BrightcoveLegacyIE -from ..utils import RegexNotFoundError, ExtractorError - - -class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' - _TEST = { - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', - 'info_dict': { - 'id': '2780937028001', - 'ext': 'mp4', - 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video', - 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61', - 'uploader': 'TechMedia Networks', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - try: - # Some videos require the playerKey field, which isn't define in - # the BrightcoveExperience object - brightcove_url = self._og_search_video_url(webpage) - except RegexNotFoundError: - # Other videos works fine with the info from the object - brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - if brightcove_url is None: - raise ExtractorError( - 'The webpage does not contain a video', expected=True) - return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key()) From e048d87fc9b5cfa19806649faf20fc1a7bdb82a6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 24 Feb 2016 03:49:13 +0800 Subject: [PATCH 019/508] [kuwo] Fix a test --- youtube_dl/extractor/kuwo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index f641edef8..700e44b63 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -68,6 +68,7 @@ class KuwoIE(KuwoBaseIE): 'id': '6446136', 'ext': 'mp3', 'title': '心', + 'description': 'md5:b2ab6295d014005bfc607525bfc1e38a', 'creator': 'IU', 'upload_date': '20150518', }, From 81bdc8fdf6516b05bc3a26f82eacb1889f5e46d5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 24 Feb 2016 22:08:40 +0800 Subject: [PATCH 020/508] [utils] Move base62 to utils --- youtube_dl/extractor/iqiyi.py | 15 ++------------- youtube_dl/utils.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index c3e33009a..4f02b9f87 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -18,6 +18,7 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + base62, ExtractorError, ohdave_rsa_encrypt, remove_start, @@ -126,21 +127,9 @@ class IqiyiSDK(object): class IqiyiSDKInterpreter(object): - BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - def __init__(self, sdk_code): self.sdk_code = sdk_code - @classmethod - def base62(cls, num): - if num == 0: - return '0' - ret = '' - while num: - ret = cls.BASE62_TABLE[num % 62] + ret - num = num // 62 - return ret - def decode_eval_codes(self): self.sdk_code = self.sdk_code[5:-3] @@ -154,7 +143,7 @@ class IqiyiSDKInterpreter(object): while count: count -= 1 - b62count = self.base62(count) + b62count = base62(count) symbol_table[b62count] = symbols[count] or b62count self.sdk_code = re.sub( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a3df90fad..d7a1586c0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2619,3 +2619,17 @@ def ohdave_rsa_encrypt(data, exponent, modulus): payload = int(binascii.hexlify(data[::-1]), 16) encrypted = pow(payload, exponent, 
modulus) return '%x' % encrypted + + +def base_n(num, n, table): + if num == 0: + return '0' + ret = '' + while num: + ret = table[num % n] + ret + num = num // n + return ret + + +def base62(num): + return base_n(num, 62, '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ') From d1e440a4a18522207a1a3e624bf801c8338f9146 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 14:13:00 +0800 Subject: [PATCH 021/508] [jwplatform] Separate codes for for parsing jwplayer data --- youtube_dl/extractor/jwplatform.py | 59 ++++++++++++++++-------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 8e90d5986..60a09044f 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,33 +7,9 @@ from .common import InfoExtractor from ..utils import int_or_none -class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' - _TEST = { - 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', - 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', - 'info_dict': { - 'id': 'nPripu9l', - 'ext': 'mov', - 'title': 'Big Buck Bunny Trailer', - 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.', - 'upload_date': '20081127', - 'timestamp': 1227796140, - } - } - - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', - webpage) - if mobj: - return mobj.group('url') - - def _real_extract(self, url): - video_id = self._match_id(url) - json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) - video_data = json_data['playlist'][0] +class JWPlatformBaseIE(InfoExtractor): + def _parse_jwplayer_data(self, jwplayer_data, video_id): + video_data = jwplayer_data['playlist'][0] subtitles = {} for track in video_data['tracks']: if track['kind'] == 'captions': @@ -68,3 +44,32 @@ class JWPlatformIE(InfoExtractor): 'subtitles': subtitles, 'formats': formats, } + + +class JWPlatformIE(JWPlatformBaseIE): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. 
It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + return self._parse_jwplayer_data(json_data, video_id) From 481888294d19ef52075e531ce26588d97b3d16d2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 14:23:16 +0800 Subject: [PATCH 022/508] [utils] Add base36 for use in Vidzi --- youtube_dl/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d7a1586c0..be1f3b0d7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2631,5 +2631,9 @@ def base_n(num, n, table): return ret +def base36(num): + return base_n(num, 36, '0123456789abcdefghijklmnopqrstuvwxyz') + + def base62(num): return base_n(num, 62, '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ') From 8f4a2124a914207912bf9fc37e593210e8dd423b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 14:26:07 +0800 Subject: [PATCH 023/508] [vidzi] Fix extraction --- youtube_dl/extractor/jwplatform.py | 6 ++--- youtube_dl/extractor/vidzi.py | 40 +++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 60a09044f..6770685d7 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class JWPlatformBaseIE(InfoExtractor): - def _parse_jwplayer_data(self, jwplayer_data, video_id): + def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): video_data = jwplayer_data['playlist'][0] subtitles = {} for track in video_data['tracks']: @@ -19,7 +19,7 @@ class JWPlatformBaseIE(InfoExtractor): for source in video_data['sources']: source_url = self._proto_relative_url(source['file']) source_type = source.get('type') or '' - if source_type == 'application/vnd.apple.mpegurl': + if source_type in ('application/vnd.apple.mpegurl', 'hls'): formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', 'm3u8_native', fatal=False)) elif source_type.startswith('audio'): @@ -37,7 +37,7 @@ class JWPlatformBaseIE(InfoExtractor): return { 'id': video_id, - 'title': video_data['title'], + 'title': video_data['title'] if require_title else video_data.get('title'), 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 7c6e98026..4ec07db3a 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -1,11 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import smuggle_url +import re + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + base36, + js_to_json, +) -class VidziIE(InfoExtractor): +class VidziIE(JWPlatformBaseIE): _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' _TEST = { 'url': 'http://vidzi.tv/cghql9yq6emu.html', @@ -14,7 +19,6 @@ class VidziIE(InfoExtractor): 
'id': 'cghql9yq6emu', 'ext': 'mp4', 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - 'uploader': 'vidzi.tv', }, 'params': { # m3u8 download @@ -29,11 +33,23 @@ class VidziIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - # Vidzi now uses jwplayer, which can be handled by GenericIE - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(url, {'to_generic': True}), - 'ie_key': 'Generic', - } + mobj = re.search(r"}\('(.+)',36,(\d+),'([^']+)'\.split\('\|'\)", webpage) + code, count, symbols = mobj.groups() + + count = int(count) + symbols = symbols.split('|') + + while count: + count -= 1 + if symbols[count]: + code = re.sub(r'\b%s\b' % base36(count), symbols[count], code) + + code = code.replace('\\\'', '\'') + jwplayer_data = self._parse_json( + self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'), + video_id, transform_source=js_to_json) + + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict['title'] = title + + return info_dict From 59f898b7a72284efb994a8c6baee7771046226dd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 14:37:20 +0800 Subject: [PATCH 024/508] [utils] Merge base_n functions --- youtube_dl/extractor/iqiyi.py | 4 ++-- youtube_dl/extractor/vidzi.py | 4 ++-- youtube_dl/utils.py | 16 +++++++--------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 4f02b9f87..76ecd55a4 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -18,7 +18,7 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( - base62, + base_n, ExtractorError, ohdave_rsa_encrypt, remove_start, @@ -143,7 +143,7 @@ class IqiyiSDKInterpreter(object): while count: count -= 1 - b62count = base62(count) + b62count = base_n(count, 62) symbol_table[b62count] = symbols[count] or b62count self.sdk_code = re.sub( diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 4ec07db3a..d671e27de 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -5,7 +5,7 @@ import re from .jwplatform import JWPlatformBaseIE from ..utils import ( - base36, + base_n, js_to_json, ) @@ -42,7 +42,7 @@ class VidziIE(JWPlatformBaseIE): while count: count -= 1 if symbols[count]: - code = re.sub(r'\b%s\b' % base36(count), symbols[count], code) + code = re.sub(r'\b%s\b' % base_n(count, 36), symbols[count], code) code = code.replace('\\\'', '\'') jwplayer_data = self._parse_json( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index be1f3b0d7..900e07a8e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2621,19 +2621,17 @@ def ohdave_rsa_encrypt(data, exponent, modulus): return '%x' % encrypted -def base_n(num, n, table): +def base_n(num, n, table=None): if num == 0: return '0' + + FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + assert n <= len(FULL_TABLE) + if not table: + table = FULL_TABLE[:n] + ret = '' while num: ret = table[num % n] + ret num = num // n return ret - - -def base36(num): - return base_n(num, 36, '0123456789abcdefghijklmnopqrstuvwxyz') - - -def base62(num): - return base_n(num, 62, '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ') From f52354a88927eb66bcb1f603d2d91162b5bd2b5f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 14:58:29 +0800 
Subject: [PATCH 025/508] [utils] Move codes for handling eval() from iqiyi.py --- youtube_dl/extractor/iqiyi.py | 24 ++---------------------- youtube_dl/utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 76ecd55a4..2b3952210 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -18,7 +18,7 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( - base_n, + decode_packed_codes, ExtractorError, ohdave_rsa_encrypt, remove_start, @@ -130,28 +130,8 @@ class IqiyiSDKInterpreter(object): def __init__(self, sdk_code): self.sdk_code = sdk_code - def decode_eval_codes(self): - self.sdk_code = self.sdk_code[5:-3] - - mobj = re.search( - r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}", - self.sdk_code) - obfucasted_code, count, symbols = mobj.groups() - count = int(count) - symbols = symbols.split('|') - symbol_table = {} - - while count: - count -= 1 - b62count = base_n(count, 62) - symbol_table[b62count] = symbols[count] or b62count - - self.sdk_code = re.sub( - r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], - obfucasted_code) - def run(self, target, ip, timestamp): - self.decode_eval_codes() + self.sdk_code = decode_packed_codes(self.sdk_code) functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 900e07a8e..fc7e2fb7f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2635,3 +2635,23 @@ def base_n(num, n, table=None): ret = table[num % n] + ret num = num // n return ret + + +def decode_packed_codes(code): + mobj = re.search( + r"'([^']+)',(\d+),(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}", + code) + obfucasted_code, base, count, symbols = mobj.groups() + base = int(base) + count = int(count) + symbols = symbols.split('|') + symbol_table = {} + + while count: + count -= 1 + base_n_count = base_n(count, base) + symbol_table[base_n_count] = symbols[count] or base_n_count + + return re.sub( + r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], + obfucasted_code) From e4fc8d2ebedfc6ea7f66ff9146940c73802a2edf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 15:00:48 +0800 Subject: [PATCH 026/508] [videomega] Fix extraction (closes #7606) --- youtube_dl/extractor/videomega.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 5e2e7cbac..4f0dcd18c 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import sanitized_Request +from ..utils import ( + decode_packed_codes, + sanitized_Request, +) class VideoMegaIE(InfoExtractor): - _WORKING = False _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', @@ -42,8 +44,10 @@ class VideoMegaIE(InfoExtractor): r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title) thumbnail = self._search_regex( r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) + + real_codes = decode_packed_codes(webpage) video_url = self._search_regex( - r'<source[^>]+?src="([^"]+)"', webpage, 'video URL') + r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL') return { 'id': 
video_id, From 680079be39563a2ff810602413db5245d6cbf148 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 15:13:03 +0800 Subject: [PATCH 027/508] [utils] Relaxing regex in decode_packed_codes for vidzi --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fc7e2fb7f..756ad4fd1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2639,7 +2639,7 @@ def base_n(num, n, table=None): def decode_packed_codes(code): mobj = re.search( - r"'([^']+)',(\d+),(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}", + r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)", code) obfucasted_code, base, count, symbols = mobj.groups() base = int(base) From efbd6fb8bb86c07e6f924a7ec2c4bd486face3a4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 26 Feb 2016 15:13:35 +0800 Subject: [PATCH 028/508] [vidzi] Use decode_packed_codes Javascript codes found on Vidzi are slightly different from those found in VideoMega and iQiyi. Nevertheless, the difference has no effects on the final result. --- youtube_dl/extractor/vidzi.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index d671e27de..3c78fb3d5 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -1,11 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .jwplatform import JWPlatformBaseIE from ..utils import ( - base_n, + decode_packed_codes, js_to_json, ) @@ -33,18 +31,7 @@ class VidziIE(JWPlatformBaseIE): title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - mobj = re.search(r"}\('(.+)',36,(\d+),'([^']+)'\.split\('\|'\)", webpage) - code, count, symbols = mobj.groups() - - count = int(count) - symbols = symbols.split('|') - - while count: - count -= 1 - if symbols[count]: - code = re.sub(r'\b%s\b' % base_n(count, 36), symbols[count], code) - - code = code.replace('\\\'', '\'') + code = decode_packed_codes(webpage).replace('\\\'', '\'') jwplayer_data = self._parse_json( self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'), video_id, transform_source=js_to_json) From b78b292f0c51323edaf3e18ae4f45927a55e9198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Feb 2016 22:21:47 +0600 Subject: [PATCH 029/508] [youtube] Add alternative automatic captions extraction approach (Closes #8667) --- youtube_dl/extractor/youtube.py | 83 ++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e24dd3e5b..ec90c2111 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -975,40 +975,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return {} try: args = player_config['args'] - caption_url = args['ttsurl'] - if not caption_url: - self._downloader.report_warning(err_msg) - return {} - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = 
original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') + caption_url = args.get('ttsurl') + if caption_url: + timestamp = args['timestamp'] + # We get the available subtitles + list_params = compat_urllib_parse.urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, + }) + list_url = caption_url + '&' + list_params + caption_list = self._download_xml(list_url, video_id) + original_lang_node = caption_list.find('track') + if original_lang_node is None: + self._downloader.report_warning('Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + caption_kind = original_lang_node.attrib.get('kind', '') + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats + return sub_lang_list + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + parsed_caption_url = compat_urlparse.urlparse(caption_url) + caption_qs = compat_parse_qs(parsed_caption_url.query) sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if not sub_lang: + continue sub_formats = [] for ext in self._SUBTITLE_FORMATS: - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], }) + sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( + query=compat_urllib_parse.urlencode(caption_qs, True))) sub_formats.append({ - 'url': caption_url + '&' + params, + 'url': sub_url, 'ext': ext, }) sub_lang_list[sub_lang] = sub_formats From 2ebd2eac880e21e0fe9751e6cef28ac009f69d79 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 27 Feb 2016 00:57:35 +0800 Subject: [PATCH 030/508] [letv] Speedup M3U8 decryption --- youtube_dl/extractor/letv.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 9665ece89..9fd494c29 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -94,17 +94,16 @@ class LetvIE(InfoExtractor): return encrypted_data encrypted_data = encrypted_data[5:] - _loc4_ = bytearray() - while encrypted_data: - b = compat_ord(encrypted_data[0]) - _loc4_.extend([b // 16, b & 0x0f]) - encrypted_data = encrypted_data[1:] + _loc4_ = bytearray(2 * len(encrypted_data)) + for idx, val in enumerate(encrypted_data): + b = compat_ord(val) + _loc4_[2 * idx] = b // 16 + _loc4_[2 * idx + 1] = b % 16 idx = len(_loc4_) - 11 _loc4_ = _loc4_[idx:] + _loc4_[:idx] - _loc7_ = bytearray() - while _loc4_: - _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) - _loc4_ = _loc4_[2:] + _loc7_ = bytearray(len(encrypted_data)) + 
for i in range(len(encrypted_data)): + _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1] return bytes(_loc7_) From 4435c6e98eb44b10c306b53c85c32458fbfeac88 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 27 Feb 2016 02:54:43 +0800 Subject: [PATCH 031/508] [bokecc] Add new extractor (#2336) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bokecc.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/bokecc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 77e2ae425..c30679736 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -74,6 +74,7 @@ from .bleacherreport import ( ) from .blinkx import BlinkxIE from .bloomberg import BloombergIE +from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py new file mode 100644 index 000000000..122a1cbb6 --- /dev/null +++ b/youtube_dl/extractor/bokecc.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ExtractorError + + +class BokeCCBaseIE(InfoExtractor): + def _extract_bokecc_formats(self, webpage, video_id, format_id=None): + player_params_str = self._html_search_regex( + r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', + webpage, 'player params') + + player_params = compat_parse_qs(player_params_str) + + info_xml = self._download_xml( + 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( + player_params['siteid'][0], player_params['vid'][0]), video_id) + + formats = [{ + 'format_id': format_id, + 'url': quality.find('./copy').attrib['playurl'], + 'preference': int(quality.attrib['value']), + } for quality in info_xml.findall('./video/quality')] + + self._sort_formats(formats) + + return formats + + +class BokeCCIE(BokeCCBaseIE): + _IE_DESC = 'CC视频' + _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + + _TESTS = [{ + 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', + 'info_dict': { + 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30', + 'ext': 'flv', + 'title': 'BokeCC Video', + }, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) + if not qs.get('vid') or not qs.get('uid'): + raise ExtractorError('Invalid URL', expected=True) + + video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0]) + + webpage = self._download_webpage(url, video_id) + + return { + 'id': video_id, + 'title': 'BokeCC Video', # no title provided in the webpage + 'formats': self._extract_bokecc_formats(webpage, video_id), + } From 5633b4d39d178402c6d89146c8c9c34e3bf58619 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 27 Feb 2016 02:55:11 +0800 Subject: [PATCH 032/508] [infoq] Use BokeCC extractor function --- youtube_dl/extractor/infoq.py | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 016af2084..cca0b8a93 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,15 +4,12 @@ from __future__ import unicode_literals import base64 -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_parse_qs, -) +from 
..compat import compat_urllib_parse_unquote from ..utils import determine_ext +from .bokecc import BokeCCBaseIE -class InfoQIE(InfoExtractor): +class InfoQIE(BokeCCBaseIE): _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)' _TESTS = [{ @@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor): }, }] - def _extract_bokecc_videos(self, webpage, video_id): - # TODO: bokecc.com is a Chinese video cloud platform - # It should have an independent extractor but I don't have other - # examples using bokecc - player_params_str = self._html_search_regex( - r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)', - webpage, 'player params', default=None) - - player_params = compat_parse_qs(player_params_str) - - info_xml = self._download_xml( - 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % ( - player_params['siteid'][0], player_params['vid'][0]), video_id) - - return [{ - 'format_id': 'bokecc', - 'url': quality.find('./copy').attrib['playurl'], - 'preference': int(quality.attrib['value']), - } for quality in info_xml.findall('./video/quality')] - def _extract_rtmp_videos(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor): if '/cn/' in url: # for China videos, HTTP video URL exists but always fails with 403 - formats = self._extract_bokecc_videos(webpage, video_id) + formats = self._extract_bokecc_formats(webpage, video_id) else: formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) From 5eb6bdced4765cdeb70411c6aa93ecb4163a9ffe Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 27 Feb 2016 03:19:50 +0800 Subject: [PATCH 033/508] [utils] Multiple changes to base_n() 1. Renamed to encode_base_n() 2. Allow tables longer than 62 characters 3. Raise ValueError instead of AssertionError for invalid input data 4. Return the first character in the table instead of '0' for number 0 5. 
Add tests --- test/test_utils.py | 12 ++++++++++++ youtube_dl/utils.py | 14 ++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index d0736f435..97587ad2f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -18,6 +18,7 @@ import xml.etree.ElementTree from youtube_dl.utils import ( age_restricted, args_to_str, + encode_base_n, clean_html, DateRange, detect_exe_version, @@ -802,5 +803,16 @@ The first line ohdave_rsa_encrypt(b'aa111222', e, N), '726664bd9a23fd0c70f9f1b84aab5e3905ce1e45a584e9cbcf9bcc7510338fc1986d6c599ff990d923aa43c51c0d9013cd572e13bc58f4ae48f2ed8c0b0ba881') + def test_encode_base_n(self): + self.assertEqual(encode_base_n(0, 30), '0') + self.assertEqual(encode_base_n(80, 30), '2k') + + custom_table = '9876543210ZYXWVUTSRQPONMLKJIHGFEDCBA' + self.assertEqual(encode_base_n(0, 30, custom_table), '9') + self.assertEqual(encode_base_n(80, 30, custom_table), '7P') + + self.assertRaises(ValueError, encode_base_n, 0, 70) + self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 756ad4fd1..606977c58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2621,15 +2621,17 @@ def ohdave_rsa_encrypt(data, exponent, modulus): return '%x' % encrypted -def base_n(num, n, table=None): - if num == 0: - return '0' - +def encode_base_n(num, n, table=None): FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - assert n <= len(FULL_TABLE) if not table: table = FULL_TABLE[:n] + if n > len(table): + raise ValueError('base %d exceeds table length %d' % (n, len(table))) + + if num == 0: + return table[0] + ret = '' while num: ret = table[num % n] + ret @@ -2649,7 +2651,7 @@ def decode_packed_codes(code): while count: count -= 1 - base_n_count = base_n(count, base) + base_n_count = encode_base_n(count, base) symbol_table[base_n_count] = symbols[count] or base_n_count return re.sub( From fbb6edd298a34cf63a5fd0bd900ca2337038810c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 27 Feb 2016 06:48:13 +0600 Subject: [PATCH 034/508] [extractor/common] Properly extract audio only formats in master m3u8 playlists --- youtube_dl/extractor/common.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 14f575635..51351fb57 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1084,19 +1084,29 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, } - codecs = last_info.get('CODECS') - if codecs: - # TODO: looks like video codec is not always necessarily goes first - va_codecs = codecs.split(',') - if va_codecs[0]: - f['vcodec'] = va_codecs[0] - if len(va_codecs) > 1 and va_codecs[1]: - f['acodec'] = va_codecs[1] resolution = last_info.get('RESOLUTION') if resolution: width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + codecs = last_info.get('CODECS') + if codecs: + vcodec, acodec = [None] * 2 + va_codecs = codecs.split(',') + if len(va_codecs) == 1: + # Audio only entries usually come with single codec and + # no resolution. For more robustness we also check it to + # be mp4 audio. 
+ if not resolution and va_codecs[0].startswith('mp4a'): + vcodec, acodec = 'none', va_codecs[0] + else: + vcodec = va_codecs[0] + else: + vcodec, acodec = va_codecs[:2] + f.update({ + 'acodec': acodec, + 'vcodec': vcodec, + }) if last_media is not None: f['m3u8_media'] = last_media last_media = None From 9cdffeeb3fbfa28407a8a519a58b3714b7cec874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 27 Feb 2016 07:01:11 +0600 Subject: [PATCH 035/508] [extractor/common] Clarify rationale on media playlist detection --- youtube_dl/extractor/common.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 51351fb57..3f16b1b9e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1033,11 +1033,21 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() - # A Media Playlist Tag MUST NOT appear in a Master Playlist - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 - # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists - # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 - if '#EXT-X-TARGETDURATION' in m3u8_doc: + + # We should try extracting formats only from master playlists [1], i.e. + # playlists that describe available qualities. On the other hand media + # playlists [2] should be returned as is since they contain just the media + # without qualities renditions. + # Fortunately, master playlist can be easily distinguished from media + # playlist based on particular tags availability. As of [1, 2] master + # playlist tags MUST NOT appear in a media playist and vice versa. + # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist + # and MUST NOT appear in master playlist thus we can clearly detect media + # playlist with this criterion. + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # 3. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is return [{ 'url': m3u8_url, 'format_id': m3u8_id, From f7f2e53a0a25e8c99f6f1a50cb3843d8e6f5be6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 27 Feb 2016 15:51:25 +0600 Subject: [PATCH 036/508] [imdb] Recognize 1080p formats (Closes #8677) --- youtube_dl/extractor/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 02e1e428e..b61b2dc4e 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -42,7 +42,7 @@ class ImdbIE(InfoExtractor): for f_url, f_name in extra_formats] format_pages.append(player_page) - quality = qualities(['SD', '480p', '720p']) + quality = qualities(('SD', '480p', '720p', '1080p')) formats = [] for format_page in format_pages: json_data = self._search_regex( From d6e9c2706fd803b5963fba47b168b6bccc063667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 27 Feb 2016 16:58:11 +0600 Subject: [PATCH 037/508] [tnaflixnetwork:embed] Add extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tnaflix.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c30679736..5817140c0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -737,6 +737,7 @@ from .tmz import ( TMZArticleIE, ) from .tnaflix import ( + TNAFlixNetworkEmbedIE, TNAFlixIE, EMPFlixIE, MovieFapIE, diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 49516abca..547d83d09 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -71,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id webpage = self._download_webpage(url, display_id) @@ -152,6 +152,30 @@ class TNAFlixNetworkBaseIE(InfoExtractor): } +class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' + + _TITLE_REGEX = r'<title>([^<]+)' + + _TESTS = [{ + 'url': 'https://player.tnaflix.com/video/6538', + 'info_dict': { + 'id': '6538', + 'display_id': '6538', + 'ext': 'mp4', + 'title': 'Educational xxx video', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://player.empflix.com/video/33051', + 'only_matching': True, + }] + + class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P[^/]+)/video(?P\d+)' From 8fab62482abf8e75f2dd98fedbe95d81ce767382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Feb 2016 16:59:10 +0600 Subject: [PATCH 038/508] [tnaflixnetwork] Fallback age limit to 18 --- youtube_dl/extractor/tnaflix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 547d83d09..e8c272704 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -117,7 +117,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): title = self._html_search_regex( 
self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) - age_limit = self._rta_search(webpage) + age_limit = self._rta_search(webpage) or 18 duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration', default=None)) From 63719a8ac3054f702238632495e291dd6f81e6b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Feb 2016 17:15:06 +0600 Subject: [PATCH 039/508] [tnaflixnetwork:embed] Add _extract_urls --- youtube_dl/extractor/tnaflix.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index e8c272704..79f036fe4 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -175,6 +175,12 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r']+?src=(["\'])(?P(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', + webpage)] + class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P[^/]+)/video(?P\d+)' From 2c9ca78281f84abe194bdf23a5e06b747961c9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Feb 2016 17:15:49 +0600 Subject: [PATCH 040/508] [extractor/generic] Add support for tnaflix network embeds (Closes #7505) --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c6bf8d270..ca745ae41 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE @@ -1633,6 +1634,11 @@ class GenericIE(InfoExtractor): if xhamster_urls: return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + # Look for embedded TNAFlixNetwork player + tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) + if tnaflix_urls: + return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) From 950505370416e79073acdeaa73a1023950363266 Mon Sep 17 00:00:00 2001 From: Aidan Rowe Date: Mon, 8 Feb 2016 20:57:07 +1000 Subject: [PATCH 041/508] [dplay] add support for it.dplay.com and dplay.dk --- youtube_dl/extractor/dplay.py | 84 +++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 6cda56a7f..adcfc1f0e 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -5,40 +5,83 @@ import time from .common import InfoExtractor from ..utils import int_or_none - +from ..compat import compat_urlparse class DPlayIE(InfoExtractor): - _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'(?Phttp://(?:it|www)\.dplay\.(?:com|dk|se))/[^/]+/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', - 'info_dict': { - 'id': '3172', - 'ext': 'mp4', - 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', - 'title': 'Svensken lär sig njuta av livet', - 'duration': 2650, 
+ _TESTS = [ + { + 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', + 'info_dict': { + 'id': '1255600', + 'ext': 'mp4', + 'display_id': 'stagione-1-episodio-25', + 'title': 'Episodio 25', + 'duration': 2761, + 'description': "Gabriele Corsi conduce un nuovo provocante e divertente dating show. 30 ragazze single hanno l'opportunità di conoscere un ragazzo e decidere se tenerlo in gioco oppure no accendendo o spegnendo le luci.", + 'season_number': 1, + 'episode_number': 25, + }, }, - } + { + 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', + 'info_dict': { + 'id': '3172', + 'ext': 'mp4', + 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', + 'title': 'Svensken lär sig njuta av livet', + 'duration': 2650, + 'description': "\"Svensken lär sig njuta av livet\". Införandet av systembolaget, industrisemestern och Skarastadgan. Med hjälp av arkivmaterial, experter och fakta ska händelserna dissekeras, analyseras och dras till sin absoluta underhållningsspets.", + 'season_number': 1, + 'episode_number': 1, + }, + }, + { + 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', + 'info_dict': { + 'id': '70816', + 'ext': 'mp4', + 'display_id': 'season-6-episode-12', + 'title': 'Episode 12', + 'duration': 2563, + 'description': " I sæsonafslutningen sker der store ting for mor og datter.\nDagen er endelig kommet for den højgravide Irina - hun skal føde! Men det bliver en lang og sej kamp for Irina, som selvfølgelig har mor Mila med som støtte hele vejen.\nMor Jette og Jessica er igen hjemme i Danmark efter deres store USA-eventyr. Og for at holde fast i den amerikanske ånd, tager pigerne i dag til gospel-undervisning. \nOg så skal Joy og mor Mia under kniven - de skal nemlig have gjort lårene mindre og ballerne større. 
\n ", + 'season_number': 6, + 'episode_number': 12, + }, + } + ] def _real_extract(self, url): + #this extrator works with it.dplay.com, www.dplay.se and www.dplay.dk + # so we need to determine the domain to send the requests to + domain = self._search_regex(self._VALID_URL, url, 'domain') display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex( r'data-video-id="(\d+)"', webpage, 'video id') - info = self._download_json( - 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id, + video_url = compat_urlparse.urljoin(domain, 'api/v2/ajax/videos?video_id=') + + info = self._download_json(video_url + video_id, video_id)['data'][0] - self._set_cookie( - 'secure.dplay.se', 'dsc-geo', - '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000)) # TODO: consider adding support for 'stream_type=hds', it seems to # require setting some cookies - manifest_url = self._download_json( - 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id, - video_id, 'Getting manifest url for hls stream')['hls'] + # get url's TLD to determine which cookie and url to use + domain_tld = domain.split('.')[-1] + if domain_tld == 'se' or domain_tld == 'dk': + self._set_cookie( + 'secure.dplay.%s' % domain_tld, 'dsc-geo', + '{"countryCode":"%s","expiry":%d}' % (domain_tld.upper(), ((time.time() + 20 * 60) * 1000))) + + manifest_url = self._download_json( + 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % (domain_tld, video_id), + video_id, 'Getting manifest url for hls stream')['hls'] + else: + #.it requires no cookies at this point + manifest_url = info['hls'] + formats = self._extract_m3u8_formats( manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native') @@ -48,4 +91,7 @@ class DPlayIE(InfoExtractor): 'title': info['title'], 'formats': formats, 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), + 'description': info.get('video_metadata_longDescription'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), } From 940b606a0743c0f23aa4313019d8af67d863f064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Feb 2016 21:30:47 +0600 Subject: [PATCH 042/508] [dplay] Improve, extract all formats and metadata (Closes #8463) --- youtube_dl/extractor/dplay.py | 172 ++++++++++++++++++++-------------- 1 file changed, 103 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index adcfc1f0e..87071c4f8 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,97 +1,131 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import json +import re import time from .common import InfoExtractor from ..utils import int_or_none -from ..compat import compat_urlparse + class DPlayIE(InfoExtractor): - _VALID_URL = r'(?Phttp://(?:it|www)\.dplay\.(?:com|dk|se))/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'http://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se))/[^/]+/(?P[^/?#]+)' - _TESTS = [ - { - 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', - 'info_dict': { - 'id': '1255600', - 'ext': 'mp4', - 'display_id': 'stagione-1-episodio-25', - 'title': 'Episodio 25', - 'duration': 2761, - 'description': "Gabriele Corsi conduce un nuovo provocante e divertente dating show. 
30 ragazze single hanno l'opportunità di conoscere un ragazzo e decidere se tenerlo in gioco oppure no accendendo o spegnendo le luci.", - 'season_number': 1, - 'episode_number': 25, - }, + _TESTS = [{ + 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', + 'info_dict': { + 'id': '1255600', + 'display_id': 'stagione-1-episodio-25', + 'ext': 'mp4', + 'title': 'Episodio 25', + 'description': 'md5:cae5f40ad988811b197d2d27a53227eb', + 'duration': 2761, + 'timestamp': 1454701800, + 'upload_date': '20160205', + 'creator': 'RTIT', + 'series': 'Take me out', + 'season_number': 1, + 'episode_number': 25, + 'age_limit': 0, }, - { - 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', - 'info_dict': { - 'id': '3172', - 'ext': 'mp4', - 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', - 'title': 'Svensken lär sig njuta av livet', - 'duration': 2650, - 'description': "\"Svensken lär sig njuta av livet\". Införandet av systembolaget, industrisemestern och Skarastadgan. Med hjälp av arkivmaterial, experter och fakta ska händelserna dissekeras, analyseras och dras till sin absoluta underhållningsspets.", - 'season_number': 1, - 'episode_number': 1, - }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/', + 'info_dict': { + 'id': '3172', + 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet', + 'ext': 'flv', + 'title': 'Svensken lär sig njuta av livet', + 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8', + 'duration': 2650, + 'timestamp': 1365454320, + 'upload_date': '20130408', + 'creator': 'Kanal 5 (Home)', + 'series': 'Nugammalt - 77 händelser som format Sverige', + 'season_number': 1, + 'episode_number': 1, + 'age_limit': 0, }, - { - 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', - 'info_dict': { - 'id': '70816', - 'ext': 'mp4', - 'display_id': 'season-6-episode-12', - 'title': 'Episode 12', - 'duration': 2563, - 'description': " I sæsonafslutningen sker der store ting for mor og datter.\nDagen er endelig kommet for den højgravide Irina - hun skal føde! Men det bliver en lang og sej kamp for Irina, som selvfølgelig har mor Mila med som støtte hele vejen.\nMor Jette og Jessica er igen hjemme i Danmark efter deres store USA-eventyr. Og for at holde fast i den amerikanske ånd, tager pigerne i dag til gospel-undervisning. \nOg så skal Joy og mor Mia under kniven - de skal nemlig have gjort lårene mindre og ballerne større. 
\n ", - 'season_number': 6, - 'episode_number': 12, - }, - } - ] + }, { + 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/', + 'info_dict': { + 'id': '70816', + 'display_id': 'season-6-episode-12', + 'ext': 'flv', + 'title': 'Episode 12', + 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90', + 'duration': 2563, + 'timestamp': 1429696800, + 'upload_date': '20150422', + 'creator': 'Kanal 4', + 'series': 'Mig og min mor', + 'season_number': 6, + 'episode_number': 12, + 'age_limit': 0, + }, + }] def _real_extract(self, url): - #this extrator works with it.dplay.com, www.dplay.se and www.dplay.dk - # so we need to determine the domain to send the requests to - domain = self._search_regex(self._VALID_URL, url, 'domain') - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + domain = mobj.group('domain') + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( - r'data-video-id="(\d+)"', webpage, 'video id') + r'data-video-id=["\'](\d+)', webpage, 'video id') - video_url = compat_urlparse.urljoin(domain, 'api/v2/ajax/videos?video_id=') - - info = self._download_json(video_url + video_id, + info = self._download_json( + 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id), video_id)['data'][0] - # TODO: consider adding support for 'stream_type=hds', it seems to - # require setting some cookies - # get url's TLD to determine which cookie and url to use + title = info['title'] + + PROTOCOLS = ('hls', 'hds') + formats = [] + + def extract_formats(protocol, manifest_url): + if protocol == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)) + elif protocol == 'hds': + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0', + video_id, f4m_id=protocol, fatal=False)) + domain_tld = domain.split('.')[-1] - if domain_tld == 'se' or domain_tld == 'dk': - self._set_cookie( - 'secure.dplay.%s' % domain_tld, 'dsc-geo', - '{"countryCode":"%s","expiry":%d}' % (domain_tld.upper(), ((time.time() + 20 * 60) * 1000))) - - manifest_url = self._download_json( - 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % (domain_tld, video_id), - video_id, 'Getting manifest url for hls stream')['hls'] + if domain_tld in ('se', 'dk'): + for protocol in PROTOCOLS: + self._set_cookie( + 'secure.dplay.%s' % domain_tld, 'dsc-geo', + json.dumps({ + 'countryCode': domain_tld.upper(), + 'expiry': (time.time() + 20 * 60) * 1000, + })) + stream = self._download_json( + 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s' + % (domain_tld, video_id, protocol), video_id, + 'Downloading %s stream JSON' % protocol, fatal=False) + if stream and stream.get(protocol): + extract_formats(protocol, stream[protocol]) else: - #.it requires no cookies at this point - manifest_url = info['hls'] - - formats = self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native') + for protocol in PROTOCOLS: + if info.get(protocol): + extract_formats(protocol, info[protocol]) return { 'id': video_id, 'display_id': display_id, - 'title': info['title'], - 'formats': formats, - 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), + 'title': title, 'description': info.get('video_metadata_longDescription'), + 'duration': int_or_none(info.get('video_metadata_length'), scale=1000), + 'timestamp': 
int_or_none(info.get('video_publish_date')), + 'creator': info.get('video_metadata_homeChannel'), + 'series': info.get('video_metadata_show'), 'season_number': int_or_none(info.get('season')), 'episode_number': int_or_none(info.get('episode')), + 'age_limit': int_or_none(info.get('minimum_age')), + 'formats': formats, } From 20afe8bd14c76af308d4fe2fce4cef05067b10c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Feb 2016 21:31:43 +0600 Subject: [PATCH 043/508] Credit @aidan- for more dplay sites support (#8463) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 0e90a3ecb..b51e23f2d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -160,3 +160,4 @@ Erwin de Haan Jens Wille Robin Houtevelts Patrick Griffis +Aidan Rowe From 5add979d918ceb2718026bdfa66f141078b09b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Feb 2016 21:42:08 +0600 Subject: [PATCH 044/508] [dplay] Add support for dplay.no --- youtube_dl/extractor/dplay.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 87071c4f8..a638c827c 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -10,7 +10,7 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se))/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'http://(?Pit\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', @@ -64,6 +64,9 @@ class DPlayIE(InfoExtractor): 'episode_number': 12, 'age_limit': 0, }, + }, { + 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/', + 'only_matching': True, }] def _real_extract(self, url): From da665ddc2551147b7b38f919c9d5c719a93f9c5d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 27 Feb 2016 21:31:21 +0100 Subject: [PATCH 045/508] release 2016.02.27 --- CONTRIBUTING.md | 13 ++++++++----- docs/supportedsites.md | 6 ++++-- youtube_dl/version.py | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 39472c554..c996f03ab 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -92,7 +92,9 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): +If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. + +After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` @@ -140,16 +142,17 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. 
This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. -8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want. +8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. +9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). +10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/__init__.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! 
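To illustrate the guidance in step 8 above on treating non-mandatory fields as optional, here is a minimal sketch of a hypothetical extractor. It is not part of any patch in this series; the `yourextractor.com` URL pattern, the regexes and the JSON endpoint are placeholder assumptions, and only `id`, `title` and `url` are treated as required:

```python
# Sketch only: a tolerant extractor in the style recommended above.
# Mandatory fields fail loudly; everything else degrades gracefully.
from .common import InfoExtractor
from ..utils import int_or_none


class YourExtractorIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Mandatory: extraction should fail if these cannot be found.
        title = self._html_search_regex(
            r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
        video_url = self._html_search_regex(
            r'data-video-url="([^"]+)"', webpage, 'video URL')

        # Optional: use .get() and fatal=False so extraction keeps working
        # even when the source stops providing these fields.
        meta = self._download_json(
            'https://www.yourextractor.com/api/video/%s' % video_id,
            video_id, fatal=False) or {}
        description = meta.get('summary')
        duration = int_or_none(meta.get('duration'))
        thumbnail = self._html_search_regex(
            r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)

        return {
            'id': video_id,
            'title': title,
            'url': video_url,
            'description': description,
            'duration': duration,
            'thumbnail': thumbnail,
        }
```

With this structure, a field the site drops simply comes back as None instead of aborting the whole extraction.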
diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 74596155c..43403233d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -77,6 +77,7 @@ - **BleacherReportCMS** - **blinkx** - **Bloomberg** + - **BokeCC** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek - **Break** @@ -560,7 +561,6 @@ - **southpark.de** - **southpark.nl** - **southparkstudios.dk** - - **Space** - **SpankBang** - **Spankwire** - **Spiegel** @@ -620,6 +620,7 @@ - **TMZ** - **TMZArticle** - **TNAFlix** + - **TNAFlixNetworkEmbed** - **toggle** - **tou.tv** - **Toypics**: Toypics user profile @@ -670,6 +671,7 @@ - **Urort**: NRK P3 Urørt - **ustream** - **ustream:channel** + - **Ustudio** - **Varzesh3** - **Vbox7** - **VeeHD** @@ -685,7 +687,7 @@ - **video.mit.edu** - **VideoDetective** - **videofy.me** - - **VideoMega** (Currently broken) + - **VideoMega** - **videomore** - **videomore:season** - **videomore:video** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7a3df6a26..e2836357f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.02.22' +__version__ = '2016.02.27' From 4587915b2aed8d2a8373614e5984fe6e5ccaeae2 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 28 Feb 2016 02:56:09 +0600 Subject: [PATCH 046/508] [README.md] Make configuration file example more diverse --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d50d4b950..7d9d501d7 100644 --- a/README.md +++ b/README.md @@ -409,11 +409,14 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy: +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. + +For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your `$HOME`: ``` ---extract-audio +-x --no-mtime --proxy 127.0.0.1:3128 +-o ~/Movies/%(title)s.%(ext)s ``` You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. 
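As a usage illustration of the configuration behaviour documented above (not part of the patch itself; the path is one of the user-wide locations listed in the README and the options are the same ones used in its example), a `~/.config/youtube-dl/config` file could contain:

```
-x
--no-mtime
--proxy 127.0.0.1:3128
-o ~/Movies/%(title)s.%(ext)s
```

With such a file in place, a plain `youtube-dl <URL>` run picks these options up automatically, and `--ignore-config` disables them for a single invocation.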
From a048ffc9b01a4c6a63603edbb7204479ddebbec6 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 28 Feb 2016 03:04:06 +0600 Subject: [PATCH 047/508] [README.md] Clarify configuration file options syntax --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7d9d501d7..ea4ad327b 100644 --- a/README.md +++ b/README.md @@ -419,6 +419,8 @@ For example, with the following configuration file youtube-dl will always extrac -o ~/Movies/%(title)s.%(ext)s ``` +Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. + You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run. ### Authentication with `.netrc` file From 7a0e7779fe571972b72edc6aab4b8c93db4b22e8 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 28 Feb 2016 03:12:13 +0600 Subject: [PATCH 048/508] [README.md] Use simple wording instead of env variable for home --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ea4ad327b..44b65da02 100644 --- a/README.md +++ b/README.md @@ -411,7 +411,7 @@ which means you can modify it, redistribute it or use it however you like. You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\\youtube-dl.conf`. -For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your `$HOME`: +For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` -x --no-mtime From 8870bb4653982a81c2ff332103499e12a825099c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Feb 2016 03:37:48 +0600 Subject: [PATCH 049/508] [webofstories] Tolerate malforder og:title (Closes #8417) --- youtube_dl/extractor/webofstories.py | 64 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index 2037d9b3d..7aea47ed5 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -12,38 +12,52 @@ class WebOfStoriesIE(InfoExtractor): _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/' _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/' _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/' - _TESTS = [ - { - 'url': 'http://www.webofstories.com/play/hans.bethe/71', - 'md5': '373e4dd915f60cfe3116322642ddf364', - 'info_dict': { - 'id': '4536', - 'ext': 'mp4', - 'title': 'The temperature of the sun', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Hans Bethe talks about calculating the temperature of the sun', - 'duration': 238, - } + _TESTS = [{ + 'url': 'http://www.webofstories.com/play/hans.bethe/71', + 'md5': '373e4dd915f60cfe3116322642ddf364', + 'info_dict': { + 'id': '4536', + 'ext': 'mp4', + 'title': 'The temperature of the sun', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Hans Bethe 
talks about calculating the temperature of the sun', + 'duration': 238, + } + }, { + 'url': 'http://www.webofstories.com/play/55908', + 'md5': '2985a698e1fe3211022422c4b5ed962c', + 'info_dict': { + 'id': '55908', + 'ext': 'mp4', + 'title': 'The story of Gemmata obscuriglobus', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', + 'duration': 169, }, - { - 'url': 'http://www.webofstories.com/play/55908', - 'md5': '2985a698e1fe3211022422c4b5ed962c', - 'info_dict': { - 'id': '55908', - 'ext': 'mp4', - 'title': 'The story of Gemmata obscuriglobus', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', - 'duration': 169, - } + 'skip': 'notfound', + }, { + # malformed og:title meta + 'url': 'http://www.webofstories.com/play/54215?o=MS', + 'info_dict': { + 'id': '54215', + 'ext': 'mp4', + 'title': '"A Leg to Stand On"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Oliver Sacks talks about the death and resurrection of a limb', + 'duration': 97, }, - ] + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) + # Sometimes og:title meta is malformed + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'(?s)Title:\s*(.+?)<', webpage, 'title') description = self._html_search_meta('description', webpage) thumbnail = self._og_search_thumbnail(webpage) From 9173202b84ea303c83c30b7305a1dcec5ccbe9e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Feb 2016 14:06:26 +0100 Subject: [PATCH 050/508] [zdf] Ignore hls manifests that use https (closes #8665) The certificates are misconfigured, you get the following error mesage: ssl.CertificateError: hostname u'zdf-hdios-none-i.zdf.de' doesn't match either of 'a248.e.akamai.net', '*.akamaihd.net', '*.akamaihd-staging.net', '*.akamaized.net', '*.akamaized-staging.net' --- youtube_dl/extractor/zdf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c619a75e2..81c22a627 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -137,6 +137,10 @@ class ZDFIE(InfoExtractor): formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) elif ext == 'm3u8': + # the certificates are misconfigured (see + # https://github.com/rg3/youtube-dl/issues/8665) + if video_url.startswith('https://'): + continue formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) elif ext == 'f4m': From 6c10dbeae948b491b538f70481ee5b348a636067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Feb 2016 20:05:58 +0600 Subject: [PATCH 051/508] [screenwavemedia] Improve formats extraction --- youtube_dl/extractor/screenwavemedia.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 2cf210e0d..d2af26c1c 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -70,19 +70,19 @@ class ScreenwaveMediaIE(InfoExtractor): formats = [] for source in sources: - if source['type'] == 'hls': - formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4')) + file_ = source.get('file') + if not 
file_: + continue + if source.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4')) else: - file_ = source.get('file') - if not file_: - continue format_label = source.get('label') format_id = self._search_regex( r'_(.+?)\.[^.]+$', file_, 'format id', default=None) height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_label, 'height', default=None)) formats.append({ - 'url': source['file'], + 'url': file_, 'format_id': format_id, 'format': format_label, 'ext': source.get('type'), From 2b2dfae83ef80f86e9983ded3c4514b58b1eed60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Feb 2016 20:16:31 +0600 Subject: [PATCH 052/508] [screenwavemedia] Improve formats sorting --- youtube_dl/extractor/screenwavemedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index d2af26c1c..8e77788ae 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -88,7 +88,7 @@ class ScreenwaveMediaIE(InfoExtractor): 'ext': source.get('type'), 'height': height, }) - self._sort_formats(formats) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { 'id': video_id, From 6dae56384a3a50f832b647f798a4b4bbd770448f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Feb 2016 21:46:36 +0600 Subject: [PATCH 053/508] [screenwavemedia] Check formats' URLs --- youtube_dl/extractor/screenwavemedia.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 8e77788ae..44b0bbee6 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -76,9 +76,11 @@ class ScreenwaveMediaIE(InfoExtractor): if source.get('type') == 'hls': formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4')) else: - format_label = source.get('label') format_id = self._search_regex( r'_(.+?)\.[^.]+$', file_, 'format id', default=None) + if not self._is_valid_url(file_, video_id, format_id or 'video'): + continue + format_label = source.get('label') height = int_or_none(self._search_regex( r'^(\d+)[pP]', format_label, 'height', default=None)) formats.append({ From 4b3cd7316cbb95100f7fc4dd03d86e0fd7674996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Feb 2016 03:28:21 +0600 Subject: [PATCH 054/508] [tf1] Improve wat id regex (Closes #8691) --- youtube_dl/extractor/tf1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e1a64e284..9ee844684 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,6 +48,6 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8})\1', + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8})(?:#.*?)?\1', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From d77ab8e255e593d8534bdd47e84c0cc03c4e6efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Mar 2016 01:01:33 +0600 Subject: [PATCH 055/508] Add --mark-watched feature (Closes #5054) --- youtube_dl/__init__.py | 1 + youtube_dl/extractor/common.py | 9 +++++++++ youtube_dl/extractor/youtube.py | 26 
++++++++++++++++++++++++++ youtube_dl/options.py | 4 ++++ 4 files changed, 40 insertions(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f5f064241..79b389840 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -355,6 +355,7 @@ def _real_main(argv=None): 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, 'extract_flat': opts.extract_flat, + 'mark_watched': opts.mark_watched, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, 'fixup': opts.fixup, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3f16b1b9e..a7c700099 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1620,6 +1620,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def mark_watched(self, *args, **kwargs): + if (self._downloader.params.get('mark_watched', False) and + (self._get_login_info()[0] is not None or + self._downloader.params.get('cookiefile') is not None)): + self._mark_watched(*args, **kwargs) + + def _mark_watched(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec90c2111..ba339b67d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import itertools import json import os.path +import random import re import time import traceback @@ -1046,6 +1047,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} + def _mark_watched(self, video_id, video_info): + playback_url = video_info.get('videostats_playback_base_url', [None])[0] + if not playback_url: + return + parsed_playback_url = compat_urlparse.urlparse(playback_url) + qs = compat_urlparse.parse_qs(parsed_playback_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. 
+ CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + }) + playback_url = compat_urlparse.urlunparse( + parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + + self._download_webpage( + playback_url, video_id, 'Marking watched', + 'Unable to mark watched', fatal=False) + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1555,6 +1579,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._sort_formats(formats) + self.mark_watched(video_id, video_info) + return { 'id': video_id, 'uploader': video_uploader, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3afa8bb6f..048dee881 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -170,6 +170,10 @@ def parseOpts(overrideArguments=None): action='store_const', dest='extract_flat', const='in_playlist', default=False, help='Do not extract the videos of a playlist, only list them.') + general.add_option( + '--mark-watched', + action='store_true', dest='mark_watched', default=False, + help='Mark videos watched (YouTube only)') general.add_option( '--no-color', '--no-colors', action='store_true', dest='no_color', From 2812c24c167dfaeed62737f2b5ba2a5c7c0de97f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 1 Mar 2016 01:24:26 +0600 Subject: [PATCH 056/508] [mdr] Fix extraction (Closes #8702) --- youtube_dl/extractor/mdr.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 425fc9e2a..2338e7f96 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -14,7 +14,7 @@ from ..utils import ( class MDRIE(InfoExtractor): IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P\d+)(?:_.+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P\d+)(?:_.+?)?\.html' _TESTS = [{ # MDR regularly deletes its videos @@ -60,6 +60,9 @@ class MDRIE(InfoExtractor): }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', 'only_matching': True, + }, { + 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +71,8 @@ class MDRIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_url = self._search_regex( - r'dataURL\s*:\s*(["\'])(?P/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', - webpage, 'data url', group='url') + r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', + webpage, 'data url', default=None, group='url').replace('\/', '/') doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) From 3e7696822064c6ec7e7d62aab98093d89eeb7cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 29 Feb 2016 20:57:26 +0100 Subject: [PATCH 057/508] [rtve.es:live] Fix extraction * Update _VALID_URL to match the current URLs * Use the m3u8 manifest since I haven't figured out how to use the rtmp stream --- youtube_dl/extractor/rtve.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 603d7bd00..8a8c5d2a0 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py 
@@ -10,6 +10,7 @@ from ..utils import (
 ExtractorError,
 float_or_none,
 remove_end,
+ remove_start,
 sanitized_Request,
 std_headers,
 struct_unpack,
@@ -178,14 +179,14 @@ class RTVEInfantilIE(InfoExtractor):
 class RTVELiveIE(InfoExtractor):
 IE_NAME = 'rtve.es:live'
 IE_DESC = 'RTVE.es live streams'
- _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)'
+ _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
 
 _TESTS = [{
- 'url': 'http://www.rtve.es/noticias/directo-la-1/',
+ 'url': 'http://www.rtve.es/directo/la-1/',
 'info_dict': {
- 'id': 'directo-la-1',
- 'ext': 'flv',
- 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+ 'id': 'la-1',
+ 'ext': 'mp4',
+ 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
 },
 'params': {
 'skip_download': 'live stream',
@@ -198,23 +199,20 @@ class RTVELiveIE(InfoExtractor):
 video_id = mobj.group('id')
 webpage = self._download_webpage(url, video_id)
 
- player_url = self._search_regex(
- r'', webpage, 'player URL')
- title = remove_end(self._og_search_title(webpage), ' en directo')
+ title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
+ title = remove_start(title, 'Estoy viendo ')
 title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
 
 vidplayer_id = self._search_regex(
- r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+ r'playerId=player([0-9]+)', webpage, 'internal video ID')
+ png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
 png = self._download_webpage(png_url, video_id, 'Downloading url information')
- video_url = _decrypt_url(png)
+ m3u8_url = _decrypt_url(png)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
 
 return {
 'id': video_id,
- 'ext': 'flv',
 'title': title,
- 'url': video_url,
- 'app': 'rtve-live-live?ovpfv=2.1.2',
- 'player_url': player_url,
- 'rtmp_live': True,
+ 'formats': formats,
+ 'is_live': True,
 }

From e781ab63dba6968f54adb612907c76f6f9896095 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister
Date: Tue, 1 Mar 2016 00:05:39 +0100
Subject: [PATCH 058/508] release 2016.03.01

---
 youtube_dl/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index e2836357f..adafd601b 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.02.27'
+__version__ = '2016.03.01'

From 9fb556eef0c6e83594ca4a7fe4d924a6a2a43a60 Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan
Date: Tue, 1 Mar 2016 08:42:33 +0800
Subject: [PATCH 059/508] [iqiyi] SWF URLs are not used anymore

Since automatic detection of enc_key failed

Closes #8705

---
 youtube_dl/extractor/iqiyi.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 2b3952210..d3bee3a19 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -498,7 +498,7 @@ class IqiyiIE(InfoExtractor):
 raw_data = self._download_json(api_url, video_id)
 return raw_data
 
- def get_enc_key(self, swf_url, video_id):
+ def get_enc_key(self, video_id):
 # TODO: automatic key extraction
 # last update at 2016-01-22 for Zombie::bite
 enc_key = '6ab6d0280511493ba85594779759d4ed'
@@ -551,11 +551,9 @@ class IqiyiIE(InfoExtractor):
 r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
 video_id = self._search_regex(
 r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
 
- swf_url = self._search_regex(
- r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
 _uuid = uuid.uuid4().hex
 
- enc_key = self.get_enc_key(swf_url, video_id)
+ enc_key = self.get_enc_key(video_id)
 
 raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)

From e7998f59aa2e204ddfef2f0eea38df1075ec0191 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Tue, 1 Mar 2016 22:59:11 +0600
Subject: [PATCH 060/508] [lifenews] Fix extraction and improve (Closes #2482, closes #8714)

---
 youtube_dl/extractor/lifenews.py | 101 +++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 33 deletions(-)

diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index f8cbca7b3..a8fd639cc 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -20,18 +20,18 @@ class LifeNewsIE(InfoExtractor):
 _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
 _TESTS = [{
- 'url': 'http://lifenews.ru/news/126342',
- 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
+ # single video embedded via video/source
+ 'url': 'http://lifenews.ru/news/98736',
+ 'md5': '77c95eaefaca216e32a76a343ad89d23',
 'info_dict': {
- 'id': '126342',
+ 'id': '98736',
 'ext': 'mp4',
- 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
- 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
- 'thumbnail': 're:http://.*\.jpg',
- 'upload_date': '20140130',
+ 'title': 'Мужчина нашел дома архив оборонного завода',
+ 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+ 'upload_date': '20120805',
 }
 }, {
- # video in '
+ xml_root = self._html_search_regex(
- r'',
- start_page, 'xml root')
+ PLAYER_REGEX, start_page, 'xml root')
 xml_name = self._html_search_regex(
 r'', start_page, 'xml filename')
+ xml_name = self._html_search_regex(
+ r'