From bb6f776271c318cbb9e5b30a123b469f9d060b45 Mon Sep 17 00:00:00 2001 From: Daniel Miranda Date: Sat, 17 Sep 2016 23:29:55 -0300 Subject: [PATCH] Add CLI options to filter usable extractors Add `--enable-extractors` and `--disable-extractors` options, which make it possible to restrict the set of extractors to be considered when downloading. This is useful to handle URLs that match multiple extractors (although this should be rare), or only using particular modes of some extractors (for example, only live videos for Twitch, enabling only `twitch:stream`). Both options can be specified multiple times, and each argument is interpreted as a comma-separated list of fnmatch patterns, to allow the use of wildcards. Comparisons to extractor names are case-insensitive. The order of the arguments is not relevant - matching always proceeds as follows: - Initialize the set of considered extractors to all available - If --enable-extractors is specified, remove all extractors that *don't* match those patterns from consideration - If --disable-extractors is specified, remove all extractors that *do* match those patterns from consideration - If --age-limit is specified, remove all extractors that are not suitable from consideration Therefore, disables and the age limit take precedence over enables. --- README.md | 8 +++++ test/test_execution.py | 66 ++++++++++++++++++++++++++++++++++++++++ youtube_dl/__init__.py | 69 +++++++++++++++++++++++++++++++++++++++--- youtube_dl/options.py | 8 +++++ 4 files changed, 146 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4debe15fe..c7a9f7995 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,14 @@ which means you can modify it, redistribute it or use it however you like. extractors --force-generic-extractor Force extraction to use the generic extractor + --enable-extractors EXTRACTORS Enable only the chosen extractors. Comma- + separated list of patterns, wildcards + allowed. 
def gen_extractor_case(case):
    """Build a unittest method checking ``--list-extractors`` for one case.

    :param case: dict describing the scenario. ``'result'`` (required) is the
        list of extractor names expected in the output; ``'enable'``,
        ``'disable'`` and ``'age_limit'`` (all optional) are forwarded to
        ``--enable-extractors``, ``--disable-extractors`` and ``--age-limit``.
    :returns: a function taking ``self`` (a ``unittest.TestCase``) that runs
        youtube-dl in a subprocess and compares the listed extractor names
        with ``case['result']``, ignoring order.
    :raises KeyError: if ``case`` has no ``'result'`` entry.
    """
    enable = case.get('enable')
    disable = case.get('disable')
    age_limit = case.get('age_limit')
    result = case['result']

    def template(self):
        args = [sys.executable, 'youtube_dl/__main__.py', '--list-extractors']
        if enable:
            args.extend(['--enable-extractors', enable])
        if disable:
            args.extend(['--disable-extractors', disable])
        if age_limit:
            args.extend(['--age-limit', str(age_limit)])

        out = subprocess.check_output(args, cwd=rootDir, stderr=_DEV_NULL).decode('utf-8')
        # splitlines() also copes with CRLF output; a real list (rather than
        # filter()) is needed because Python 3's filter() is a lazy iterator.
        extractors = [line for line in out.splitlines() if line and 'BROKEN' not in line]
        # assertItemsEqual only exists on Python 2 (assertCountEqual on 3);
        # comparing sorted lists is an order-insensitive check on both.
        self.assertEqual(sorted(extractors), sorted(result))

    return template
def get_usable_extractors(enable_patterns, disable_patterns, age_limit,
                          candidates=None, verbose=None):
    """Return the extractors that survive the enable/disable/age filters.

    Filtering always runs in three phases, whatever the option order was:

    1. if any enable patterns are given, keep only matching extractors;
    2. drop extractors matching any disable pattern;
    3. if ``age_limit`` is truthy, drop extractors whose
       ``is_suitable(age_limit)`` is false.

    Disables and the age limit therefore take precedence over enables.
    Matching is done with ``fnmatch.fnmatchcase`` on lowercased names and
    lowercased patterns, so it is case-insensitive on every platform.

    :param enable_patterns: iterable of fnmatch patterns, or empty/None to
        start from every candidate.
    :param disable_patterns: iterable of fnmatch patterns, or empty/None.
    :param age_limit: value passed to each extractor's ``is_suitable``;
        a falsy value disables the age filter.
    :param candidates: optional iterable of extractor objects (each exposing
        ``IE_NAME`` and ``is_suitable``); defaults to ``gen_extractors()``.
    :param verbose: whether to write ``[debug]`` lines; defaults to
        ``opts.verbose``.
    :returns: a new list of extractors in the same relative order as
        ``candidates``. The caller sorts this list and registers its items
        with YoutubeDL in order, so a real list in registry order is
        returned rather than a dict view in pattern order.
    """
    if candidates is None:
        # Unfortunately it's necessary to create instances of all extractors
        # instead of just looking at the classes, because some of them don't
        # override the ie_key() classmethod to the correct value.
        candidates = gen_extractors()
    if verbose is None:
        verbose = opts.verbose

    def debug(message):
        if verbose:
            write_string('[debug] %s\n' % message)

    def matches_any(name, patterns):
        return any(fnmatch.fnmatchcase(name, pattern) for pattern in patterns)

    enable_patterns = [pattern.lower() for pattern in enable_patterns or ()]
    disable_patterns = [pattern.lower() for pattern in disable_patterns or ()]

    # A list of (name, extractor) pairs instead of a dict keyed by name, so
    # that extractors whose lowercased names collide are not silently lost.
    selected = [(ie.IE_NAME.lower(), ie) for ie in candidates]

    if enable_patterns:
        kept = []
        for name, ie in selected:
            if matches_any(name, enable_patterns):
                debug('Enabling extractor %s' % name)
                kept.append((name, ie))
        selected = kept

    if disable_patterns:
        kept = []
        for name, ie in selected:
            if matches_any(name, disable_patterns):
                debug('Disabling extractor %s' % name)
            else:
                kept.append((name, ie))
        selected = kept

    if age_limit:
        # Build a new list instead of deleting while iterating, which raises
        # RuntimeError for dicts on Python 3.
        kept = []
        for name, ie in selected:
            if ie.is_suitable(age_limit):
                kept.append((name, ie))
            else:
                debug('Extractor %s selected by filter, but ignored due to age limit' % name)
        selected = kept

    return [ie for _, ie in selected]


def patterns_from_args(args):
    """Yield lowercased patterns from repeated, comma-separated option values.

    :param args: list of raw option strings (one per occurrence of the
        option), or None/empty when the option was not given.
    :returns: generator of patterns with surrounding whitespace removed;
        empty items (e.g. from a trailing comma) are skipped because they
        could never match an extractor name.
    """
    for arg in args or ():
        for pattern in arg.split(','):
            pattern = pattern.strip().lower()
            if pattern:
                yield pattern
'IE_DESC', ie.IE_NAME) @@ -413,7 +466,10 @@ def _real_main(argv=None): } - with YoutubeDL(ydl_opts) as ydl: + if not extractors: + parser.error('No usable extractors selected') + + with YoutubeDL(ydl_opts, auto_init=False) as ydl: # Update version if opts.update_self: update_self(ydl.to_screen, opts.verbose, ydl._opener) @@ -422,6 +478,9 @@ def _real_main(argv=None): if opts.rm_cachedir: ydl.cache.remove() + for extractor in extractors: + ydl.add_info_extractor(extractor) + # Maybe do nothing if (len(all_urls) < 1) and (opts.load_info_filename is None): if opts.update_self or opts.rm_cachedir: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 53497fbc6..6ca7db66d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -167,6 +167,14 @@ def parseOpts(overrideArguments=None): '--force-generic-extractor', action='store_true', dest='force_generic_extractor', default=False, help='Force extraction to use the generic extractor') + general.add_option( + '--enable-extractors', metavar='EXTRACTORS', + action='append', dest='enable_extractors', + help='Enable only the chosen extractors. Comma-separated list of patterns, wildcards allowed. Example: "twitch:*,youtube:*,vimeo"') + general.add_option( + '--disable-extractors', metavar='EXTRACTORS', + action='append', dest='disable_extractors', + help='Disable the chosen extractors. Comma-separated list of patterns, wildcards allowed. Example: "twitch:*,youtube:*,vimeo"') general.add_option( '--default-search', dest='default_search', metavar='PREFIX',