From e235270df40320eb394cc5c602918f675e9cb544 Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Mon, 21 Mar 2016 21:36:48 +0100 Subject: [PATCH 1/2] [travis] Add new regression detection script as default test suite It runs tests and parses nosetests output to detect failures and test them for regressions against a reference version. If it finds a regression, it is automatically bisected. Unstable or flaky tests are detected and ignored automatically by running them multiple times in a row. We keep the original test suite around, but mark it as allowed to fail. It serves as a dashboard of current test statuses, but since the test can fail out of our control (this is the essence of this project), we don't want it to be blocking. Using the regression detection as a fail means that any failing build need to be examined. Even if the next build is "fixed", it does not mean that the regression has been fixed. This is a change in semantics when analyzing build history. We map/reduce by splitting the test suite in 7 parts by abusing travis' matrix feature. Reduce is done by hand, by analyzing the dashboard of travis, any failing test being critical. Because nosetests --processes just doesn't work with youtube-dl test suite (yet), we route around it by first enumerating tests. This is a bit long because nosetests needs to find and run all tests files in order to enumerate them, but it should be at most 30 seconds, while the test suite can take more than 2 hours on travis' infrastructure. By doing this, we ensure we'll be able to run the tests faster, since they are mostly I/O (network) bound due to the nature of the project. Closes #8496 --- .travis.yml | 15 ++- devscripts/regdetect.py | 238 ++++++++++++++++++++++++++++++++++++++++ devscripts/travis.sh | 7 ++ 3 files changed, 259 insertions(+), 1 deletion(-) create mode 100755 devscripts/regdetect.py create mode 100755 devscripts/travis.sh diff --git a/.travis.yml b/.travis.yml index c74c9cc12..6bcff0d00 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,20 @@ python: - "3.4" - "3.5" sudo: false -script: nosetests test --verbose +env: + - TESTS=complete + - TESTS=regression_1-of-7 + - TESTS=regression_2-of-7 + - TESTS=regression_3-of-7 + - TESTS=regression_4-of-7 + - TESTS=regression_5-of-7 + - TESTS=regression_6-of-7 + - TESTS=regression_7-of-7 +matrix: + allow_failures: + - env: TESTS=complete + +script: devscripts/travis.sh notifications: email: - filippo.valsorda@gmail.com diff --git a/devscripts/regdetect.py b/devscripts/regdetect.py new file mode 100755 index 000000000..bbbc05036 --- /dev/null +++ b/devscripts/regdetect.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 + +from __future__ import print_function, unicode_literals + +import subprocess +import sys +import os +import time + +NOSECOMMAND="nosetests" + + +def process(test): + # Parse nose output to get test statuses + # Format: test_ () ... [multi-line-error-message?] {ok,ERROR,FAIL} + if not test or len(test) == 0: + print("Received empty line") + return (None, None) + s = test.split() + if len(s) < 4: + print("Bad line passed, not enough elements:", test) + # We try to convert this nose output: + # test_opengraph (test.test_InfoExtractor.TestInfoExtractor) ... ok + # into this test handle: + # test.test_InfoExtractor:TestInfoExtractor.test_opengraph + testpath = s[1][1:-1].split('.') + if len(testpath) != 3: + print("Bad testpath passed, not enough elements:", testpath) + return (None, None) + fulltestname = "%s.%s:%s.%s"%(testpath[0], testpath[1], testpath[2], s[0]) + + status = s[-1] + if status not in ("ok", "FAIL", "ERROR"): + print("Unknown test status", status) + return (None, None) + + # we cannot assume that a test failing with a warning is ok (network error) + if status == "ok" and test.find("WARNING") != -1: + status = "WARNING" + + return (fulltestname, status) + +def fill_results(res, results): + if res[0] != None and res[1] != None: + results[res[0]] = res[1] + +def process_stream(f, verbose): + results = {} + buf = None + for line in f: + if verbose: print(line, end='') + if line.startswith("===========") or line.startswith("--------------"): + #this is the end + break + if line.startswith("test_"): #new test, process previous test + if buf != None: + fill_results(process(buf), results) # for every other element this signals the beginning of a new one + buf = line + else: + if buf and len(buf) > 0: # some tests have multi-line outputs + buf += line + if verbose: # print the end of the file + for line in f: # it might contain interesting info, like tracebacks + print(line, end='') + fill_results(process(buf), results) # process last line + return results + + +def launch_nose(args=[], verbose=True): + nose = subprocess.Popen([NOSECOMMAND, "-v"] + args, stderr=subprocess.PIPE, universal_newlines=True) + results = process_stream(nose.stderr, verbose) + nose.stderr.close() + nose.wait() + return results + +def filter_bad(results): + # Filter failing/error tests + redo = {} + for k in results.keys(): + if results[k] != "ok": + redo[k] = results[k] + return list(redo.keys()) + +def test_stability(refcommit, testcommit, failed_tests): + git_checkout(refcommit) + print("Testing at commit " + refcommit) + refresults = launch_nose(failed_tests) + stable_tests = [] + for k in refresults: + if refresults[k] != "ok": + print("Test %s is unreliable !"%(k)) + else: + stable_tests.append(k) + git_checkout(testcommit) + print("Back to commit " + testcommit) + return stable_tests + +def iterate_tests(refcommit, testcommit, testlist=[], iterations=9, cooldown=60): + failed_tests=testlist # empty means run all tests + # run tests passed in arguments (or all) and get list of failed tests + # keep running those tests a few times to make sure the failure wasn't + # temporary (bad connection, site error, ...) + for i in range(iterations): + if i > 3 and len(failed_tests) < 5: + # We have reduced the number of tests, we now test them for stability + print("We only have %d tests at iteration %d, testing for reliablity"%(len(failed_tests), i)) + failed_tests = test_stability(refcommit, testcommit, failed_tests) + if len(failed_tests) == 0: # no more stable tests + return {} + time.sleep(cooldown) + results = launch_nose(failed_tests) + failed_tests = filter_bad(results) + print("Run %d done. Has %d out of %d non-ok tests"%(i, len(failed_tests), len(results.keys()))) + if len(failed_tests) == 0: # no failure. Awesome ! + break + time.sleep(cooldown) + return results # this will return a partial result list. It does not matter since ok-tests aren't that interesting + +def git_checkout(arg): + ret = subprocess.call(["git", "checkout", "--quiet", arg]) + if ret != 0: + raise RuntimeError("git checkout failed") + +def regressive_tests(refresults, testresults): + regressive = [] + # Return list of tests that are ok in refresults but not in testresults + for k in refresults: + assert k in testresults, "New unknown test case" + if refresults[k] == "ok" and refresults[k] != testresults[k]: #let's assume FAIL == ERROR + regressive.append(k) + + return regressive + +def list_nose_tests(): + tests = sorted(launch_nose(["--collect-only"], verbose=False).keys()) + return tests + +def sub_tests(): + # See if we need to slice the work and do only one part + slice_arg = os.getenv("TESTS") + if slice_arg == None: + return None + + test_slice = slice_arg.split('_')[1] + slice_bounds = test_slice.split('-of-') + current_slice = int(slice_bounds[0]) + nr_slices = int(slice_bounds[1]) + all_tests = list_nose_tests() + length = len(all_tests) + sub_test_list = all_tests[(current_slice-1)*length // nr_slices : \ + current_slice*length // nr_slices] + + print("Running slice %d of %d; it has %d out of %d tests"%(current_slice, + nr_slices, len(sub_test_list), length)) + + return sub_test_list + +def bisect(good, bad, test): + def git_bisect(args): + ret = subprocess.call(["git", "bisect"] + args) + if ret != 0: + raise RuntimeError("git bisect failed with " + " ".join(args)) + print("Bisecting %s between %s and %s"%(test, good, bad)) + git_bisect(["start", bad, good]) + git_bisect(["run", NOSECOMMAND, "--verbose", "--detailed-errors", test]) + git_bisect(["reset"]) + print("Bisect done") + +def main(): + if len(sys.argv) < 3: + commit_range = os.getenv("TRAVIS_COMMIT_RANGE") + if commit_range != None: + commits = commit_range.split("...") + refcommit, testcommit = commits[0], commits[1] + else: + testcommit="master" + refcommit="master^" + else: + testcommit=sys.argv[1] + refcommit=sys.argv[2] + + print("Testing if commit-ish %s introduced regressions compared to %s"%(testcommit, refcommit)) + + git_checkout(testcommit) + + sub_test_list = sub_tests() + + if sub_test_list != None: + args = sub_test_list + else: + args = sys.argv[3:] # use remaining args to limit test selection (if there are any) + + results = launch_nose(args) + + failed_tests = filter_bad(results) + if len(failed_tests) == 0: + print("No failure, exiting") + sys.exit(0) + + print("%d tests are failing at %s, now testing if they are regression from %s" % + (len(failed_tests), testcommit, refcommit)) + + git_checkout(refcommit) + + results_ref = launch_nose(failed_tests) + print("Second run of %d tests done."%len(failed_tests)) + + regressive = regressive_tests(results_ref, results) + + git_checkout(testcommit) + + if len(regressive) == 0: + print("There was no detected regression") + sys.exit(0) + + + print("%d test(s) have a potential regression. Retrying them a few times to be sure"%len(regressive)) + + results_retry = iterate_tests(refcommit, testcommit, regressive) + + failed_retry = filter_bad(results_retry) + if len(failed_retry) == 0: + print("All false alarms, exiting") + sys.exit(0) + + print("We have %d regressions"%len(failed_retry)) + for k in failed_retry: + print("Test %s was %s in %s, is now %s at %s"%(k, results_ref[k], + refcommit, results_retry[k], testcommit)) + bisect(refcommit, testcommit, k) + + git_checkout(testcommit) + + sys.exit(-1) + +if __name__ == "__main__": + main() + diff --git a/devscripts/travis.sh b/devscripts/travis.sh new file mode 100755 index 000000000..3c7f1c495 --- /dev/null +++ b/devscripts/travis.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +if [ "$TESTS" = "complete" ]; then + exec nosetests test --verbose +else + exec ./devscripts/regdetect.py +fi From e31d538f39125cf32578f3d67b820bb615612503 Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Tue, 21 Jun 2016 09:16:49 +0200 Subject: [PATCH 2/2] [travis] Add self to email notifications --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6bcff0d00..070907faf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,7 @@ notifications: email: - filippo.valsorda@gmail.com - yasoob.khld@gmail.com + - anisse@astier.eu # irc: # channels: # - "irc.freenode.org#youtube-dl"