From f01ff9cf9d966a758ab60e186c0811bf59e57b72 Mon Sep 17 00:00:00 2001 From: Andres Noetzli Date: Thu, 21 Oct 2021 08:34:26 -0700 Subject: [PATCH] Refactor regressions script (#7249) This makes the regression script more modular by refactoring all the different checks into separate classes, which makes it easier to add additional tests and to run only a subset of the tests. --- .github/workflows/ci.yml | 10 +- .../regress1/nl/approx-sqrt-unsat.smt2 | 2 +- test/regress/run_regression.py | 737 ++++++++++++------ 3 files changed, 485 insertions(+), 264 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b8ee51837..89c108b93 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: check-examples: true store-to-release: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct - name: macos:production os: macos-11 @@ -25,7 +25,7 @@ jobs: check-examples: true store-to-release: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct - name: ubuntu:production-clang os: ubuntu-18.04 @@ -34,7 +34,7 @@ jobs: cache-key: productionclang check-examples: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct - name: ubuntu:production-dbg os: ubuntu-18.04 @@ -42,7 +42,7 @@ jobs: cache-key: dbg check-units: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores + run_regression_args: --tester base --tester model --tester synth --tester abduct --tester proof - name: ubuntu:production-dbg-clang os: ubuntu-latest @@ -50,7 +50,7 @@ jobs: config: production --auto-download --assertions --tracing --cln --gpl cache-key: dbgclang exclude_regress: 3-4 - run_regression_args: --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct --tester unsat-core name: ${{ matrix.name }} runs-on: ${{ matrix.os }} diff --git a/test/regress/regress1/nl/approx-sqrt-unsat.smt2 b/test/regress/regress1/nl/approx-sqrt-unsat.smt2 index 576fb1a67..cda24d098 100644 --- a/test/regress/regress1/nl/approx-sqrt-unsat.smt2 +++ b/test/regress/regress1/nl/approx-sqrt-unsat.smt2 @@ -1,4 +1,4 @@ -; COMMAND-LINE: --nl-ext-tplanes +; COMMAND-LINE: --nl-ext-tplanes --no-check-proofs ; EXPECT: unsat (set-logic QF_NRA) (set-info :status unsat) diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py index 32ba6f6c2..7b1c5a43f 100755 --- a/test/regress/run_regression.py +++ b/test/regress/run_regression.py @@ -11,36 +11,332 @@ # directory for licensing information. # ############################################################################# ## - """ Runs benchmark and checks for correct exit status and output. 
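The individual checks (model checking, proof checking, unsat cores, etc.) are
implemented as "testers" below. By default the base, unsat-core, proof, model,
synth, and abduct testers run where they apply to a benchmark; one or more
--tester options can be passed to run only a subset of them.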
""" import argparse +import collections import difflib import os import re import shlex import subprocess import sys +import tempfile import threading class Color: - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - ENDC = '\033[0m' + BLUE = "\033[94m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + ENDC = "\033[0m" + + +class Tester: + def __init__(self): + pass + + def applies(self, benchmark_info): + return True + + def run(self, benchmark_info): + exit_code = EXIT_OK + output, error, exit_status = run_benchmark(benchmark_info) + if exit_status == STATUS_TIMEOUT: + exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE + print("Timeout - Flags: {}".format(command_line_args)) + elif output != benchmark_info.expected_output: + exit_code = EXIT_FAILURE + print("not ok - Flags: {}".format(benchmark_info.command_line_args)) + print() + print("Standard output difference") + print("=" * 80) + print_diff(output, benchmark_info.expected_output) + print("=" * 80) + print() + print() + if error: + print("Error output") + print("=" * 80) + print_colored(Color.YELLOW, error) + print("=" * 80) + print() + elif error != benchmark_info.expected_error: + exit_code = EXIT_FAILURE + print( + "not ok - Differences between expected and actual output on stderr - Flags: {}".format( + benchmark_info.command_line_args + ) + ) + print() + print("Error output difference") + print("=" * 80) + print_diff(error, expected_error) + print("=" * 80) + print() + elif exit_status != benchmark_info.expected_exit_status: + exit_code = EXIT_FAILURE + print( + 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.format( + benchmark_info.expected_exit_status, + exit_status, + benchmark_info.command_line_args, + ) + ) + print() + print("Output:") + print("=" * 80) + print_colored(Color.BLUE, output) + print("=" * 80) + print() + print() + print("Error output:") + print("=" * 80) + print_colored(Color.YELLOW, error) + print("=" * 80) + print() + else: + print("ok - Flags: {}".format(benchmark_info.command_line_args)) + return exit_code + +################################################################################ +# The testers +# +# Testers use `Tester` as a base class and implement `applies()` and `run()` +# methods. The `applies()` method returns `True` if a tester applies to a given +# benchmark and `run()` runs the actual test. Most testers can invoke the +# `run()` method in the base class, which calls the cvc5 binary with a set of +# arguments and checks the expected output (both stdout and stderr) as well as +# the exit status. +# +# To add a new tester, add a class and add it to the `g_tester` dictionary. 
+################################################################################ + + +class BaseTester(Tester): + def __init__(self): + pass + + def run(self, benchmark_info): + return super().run(benchmark_info) + + +class UnsatCoreTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + return ( + benchmark_info.benchmark_ext != ".sy" + and ( + "unsat" in benchmark_info.expected_output.split() + or "entailed" in benchmark_info.expected_output.split() + ) + and "--no-produce-unsat-cores" not in benchmark_info.command_line_args + and "--no-check-unsat-cores" not in benchmark_info.command_line_args + and "--check-unsat-cores" not in benchmark_info.command_line_args + and "sygus-inference" not in benchmark_info.benchmark_content + and "--unconstrained-simp" not in benchmark_info.command_line_args + ) -SCRUBBER = 'SCRUBBER:' -ERROR_SCRUBBER = 'ERROR-SCRUBBER:' -EXPECT = 'EXPECT:' -EXPECT_ERROR = 'EXPECT-ERROR:' -EXIT = 'EXIT:' -COMMAND_LINE = 'COMMAND-LINE:' -REQUIRES = 'REQUIRES:' + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + ["--check-unsat-cores"] + ) + ) + + +class ProofTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + expected_output_lines = benchmark_info.expected_output.split() + return ( + benchmark_info.benchmark_ext != ".sy" + and ( + "unsat" in benchmark_info.expected_output.split() + or "entailed" in benchmark_info.expected_output.split() + ) + and "--no-produce-proofs" not in benchmark_info.command_line_args + and "--no-check-proofs" not in benchmark_info.command_line_args + and "--check-proofs" not in benchmark_info.command_line_args + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + ["--check-proofs"] + ) + ) + + +class ModelTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + expected_output_lines = benchmark_info.expected_output.split() + return ( + benchmark_info.benchmark_ext != ".sy" + and ("sat" in expected_output_lines or "unknown" in expected_output_lines) + and "--no-debug-check-models" not in benchmark_info.command_line_args + and "--no-check-models" not in benchmark_info.command_line_args + and "--debug-check-models" not in benchmark_info.command_line_args + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + ["--debug-check-models"] + ) + ) + + +class SynthTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + return ( + benchmark_info.benchmark_ext == ".sy" + and "--no-check-synth-sol" not in benchmark_info.command_line_args + and "--sygus-rr" not in benchmark_info.command_line_args + and "--check-synth-sol" not in benchmark_info.command_line_args + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + ["--check-synth-sol"] + ) + ) + + +class AbductTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + return ( + benchmark_info.benchmark_ext != ".sy" + and "--no-check-abducts" not in benchmark_info.command_line_args + and "--check-abducts" not in benchmark_info.command_line_args + and "get-abduct" in benchmark_info.benchmark_content + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + 
command_line_args=benchmark_info.command_line_args + ["--check-abducts"] + ) + ) + + +class DumpTester(Tester): + def applies(self, benchmark_info): + return benchmark_info.expected_exit_status == EXIT_OK + + def run(self, benchmark_info): + ext_to_lang = { + ".smt2": "smt2", + ".p": "tptp", + ".sy": "sygus", + } + + tmpf_name = None + with tempfile.NamedTemporaryFile(delete=False) as tmpf: + dump_args = [ + "--parse-only", + "-o", + "raw-benchmark", + f"--output-lang={ext_to_lang[benchmark_info.benchmark_ext]}", + ] + dump_output, _, _ = run_process( + [benchmark_info.cvc5_binary] + + benchmark_info.command_line_args + + dump_args + + [benchmark_info.benchmark_basename], + benchmark_info.benchmark_dir, + benchmark_info.timeout, + ) + + tmpf_name = tmpf.name + tmpf.write(dump_output) + + if not tmpf_name: + return EXIT_FAILURE + + exit_code = super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + [ + "--parse-only", + f"--lang={ext_to_lang[benchmark_info.benchmark_ext]}", + ], + benchmark_basename=tmpf.name, + expected_output="", + ) + ) + os.remove(tmpf.name) + return exit_code + + +g_testers = { + "base": BaseTester(), + "unsat-core": UnsatCoreTester(), + "proof": ProofTester(), + "model": ModelTester(), + "synth": SynthTester(), + "abduct": AbductTester(), + "dump": DumpTester(), +} + +g_default_testers = [ + "base", + "unsat-core", + "proof", + "model", + "synth", + "abduct", +] + +################################################################################ + +BenchmarkInfo = collections.namedtuple( + "BenchmarkInfo", + [ + "wrapper", + "scrubber", + "error_scrubber", + "timeout", + "cvc5_binary", + "benchmark_dir", + "benchmark_basename", + "benchmark_ext", + "benchmark_content", + "expected_output", + "expected_error", + "expected_exit_status", + "command_line_args", + ], +) + +SCRUBBER = "SCRUBBER:" +ERROR_SCRUBBER = "ERROR-SCRUBBER:" +EXPECT = "EXPECT:" +EXPECT_ERROR = "EXPECT-ERROR:" +EXIT = "EXIT:" +COMMAND_LINE = "COMMAND-LINE:" +REQUIRES = "REQUIRES:" EXIT_OK = 0 EXIT_FAILURE = 1 @@ -58,14 +354,12 @@ def print_colored(color, text): def print_diff(actual, expected): """Prints the difference between `actual` and `expected`.""" - for line in difflib.unified_diff(expected.splitlines(), - actual.splitlines(), - 'expected', - 'actual', - lineterm=''): - if line.startswith('+'): + for line in difflib.unified_diff( + expected.splitlines(), actual.splitlines(), "expected", "actual", lineterm="" + ): + if line.startswith("+"): print_colored(Color.GREEN, line) - elif line.startswith('-'): + elif line.startswith("-"): print_colored(Color.RED, line) else: print(line) @@ -78,14 +372,16 @@ def run_process(args, cwd, timeout, s_input=None): output and the exit code of the process. 
If the process times out, the output and the error output are empty and the exit code is 124.""" - proc = subprocess.Popen(args, - cwd=cwd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + proc = subprocess.Popen( + args, + cwd=cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) - out = '' - err = '' + out = "" + err = "" exit_status = STATUS_TIMEOUT try: if timeout: @@ -106,61 +402,60 @@ def run_process(args, cwd, timeout, s_input=None): def get_cvc5_features(cvc5_binary): """Returns a list of features supported by the cvc5 binary `cvc5_binary`.""" - output, _, _ = run_process([cvc5_binary, '--show-config'], None, None) + output, _, _ = run_process([cvc5_binary, "--show-config"], None, None) if isinstance(output, bytes): output = output.decode() features = [] disabled_features = [] - for line in output.split('\n'): - tokens = [t.strip() for t in line.split(':')] + for line in output.split("\n"): + tokens = [t.strip() for t in line.split(":")] if len(tokens) == 2: key, value = tokens - if value == 'yes': + if value == "yes": features.append(key) - elif value == 'no': + elif value == "no": disabled_features.append(key) return features, disabled_features -def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc5_binary, - command_line, benchmark_dir, benchmark_filename, timeout): - """Runs cvc5 on the file `benchmark_filename` in the directory - `benchmark_dir` using the binary `cvc5_binary` with the command line - options `command_line`. The output is scrubbed using `scrubber` and - `error_scrubber` for stdout and stderr, respectively. If dump is true, the - function first uses cvc5 to read in and dump the benchmark file and then - uses that as input.""" +def run_benchmark(benchmark_info): + """Runs cvc5 on a benchmark with the given `benchmark_info`. It runs on the + file `benchmark_basename` in the directory `benchmark_dir` using the binary + `cvc5_binary` with the command line options `command_line_args`. The output + is scrubbed using `scrubber` and `error_scrubber` for stdout and stderr, + respectively.""" - bin_args = wrapper[:] - bin_args.append(cvc5_binary) + bin_args = benchmark_info.wrapper[:] + bin_args.append(benchmark_info.cvc5_binary) output = None error = None exit_status = None - if dump: - dump_args = [ - '--parse-only', '-o', 'raw-benchmark', '--output-lang=smt2' - ] - dump_output, _, _ = run_process( - bin_args + command_line + dump_args + [benchmark_filename], - benchmark_dir, timeout) - output, error, exit_status = run_process( - bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir, - timeout, dump_output) - else: - output, error, exit_status = run_process( - bin_args + command_line + [benchmark_filename], benchmark_dir, - timeout) + output, error, exit_status = run_process( + bin_args + + benchmark_info.command_line_args + + [benchmark_info.benchmark_basename], + benchmark_info.benchmark_dir, + benchmark_info.timeout, + ) # If a scrubber command has been specified then apply it to the output. 
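    # A scrubber is an arbitrary command declared in the benchmark with a
    # SCRUBBER: (or ERROR-SCRUBBER:) directive, e.g. a sed script that
    # normalizes nondeterministic parts of the output (the concrete command is
    # benchmark-specific). It receives the solver output on stdin, and its
    # stdout replaces that output before comparison with the expected output.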
- if scrubber: - output, _, _ = run_process(shlex.split(scrubber), benchmark_dir, - timeout, output) - if error_scrubber: - error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir, - timeout, error) + if benchmark_info.scrubber: + output, _, _ = run_process( + shlex.split(benchmark_info.scrubber), + benchmark_info.benchmark_dir, + benchmark_info.timeout, + output, + ) + if benchmark_info.error_scrubber: + error, _, _ = run_process( + shlex.split(benchmark_info.error_scrubber), + benchmark_info.benchmark_dir, + benchmark_info.timeout, + error, + ) # Popen in Python 3 returns a bytes object instead of a string for # stdout/stderr. @@ -168,22 +463,28 @@ def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc5_binary, output = output.decode() if isinstance(error, bytes): error = error.decode() - return (output.strip(), error.strip(), exit_status) - - -def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, - skip_timeout, wrapper, cvc5_binary, benchmark_path, - timeout): - """Determines the expected output for a benchmark, runs cvc5 on it and then - checks whether the output corresponds to the expected output. Optionally - uses a wrapper `wrapper`, tests unsat cores (if check_unsat_cores is true), - checks proofs (if check_proofs is true), or dumps a benchmark and uses that as - the input (if dump is true). `use_skip_return_code` enables/disables - returning 77 when a test is skipped.""" + output = re.sub(r"^[ \t]*", "", output.strip(), flags=re.MULTILINE) + error = re.sub(r"^[ \t]*", "", error.strip(), flags=re.MULTILINE) + return (output, error, exit_status) + + +def run_regression( + testers, + use_skip_return_code, + skip_timeout, + wrapper, + cvc5_binary, + benchmark_path, + timeout, +): + """Determines the expected output for a benchmark, runs cvc5 on it using + all the specified `testers` and then checks whether the output corresponds + to the expected output. Optionally uses a wrapper `wrapper`. 
+ `use_skip_return_code` enables/disables returning 77 when a test is + skipped.""" if not os.access(cvc5_binary, os.X_OK): - sys.exit( - '"{}" does not exist or is not executable'.format(cvc5_binary)) + sys.exit('"{}" does not exist or is not executable'.format(cvc5_binary)) if not os.path.isfile(benchmark_path): sys.exit('"{}" does not exist or is not a file'.format(benchmark_path)) @@ -194,38 +495,34 @@ def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, benchmark_basename = os.path.basename(benchmark_path) benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename) benchmark_dir = os.path.dirname(benchmark_path) - comment_char = '%' + comment_char = "%" status_regex = None status_to_output = lambda s: s - if benchmark_ext == '.smt': - status_regex = r':status\s*(sat|unsat)' - comment_char = ';' - elif benchmark_ext == '.smt2': - status_regex = r'set-info\s*:status\s*(sat|unsat)' - comment_char = ';' - elif benchmark_ext == '.p': - status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)' - status_to_output = lambda s: '% SZS status {} for {}'.format( - s, benchmark_filename) - elif benchmark_ext == '.sy': - comment_char = ';' - # Do not check proofs/unsat-cores with .sy files - check_unsat_cores = False - check_proofs = False + if benchmark_ext == ".smt2": + status_regex = r"set-info\s*:status\s*(sat|unsat)" + comment_char = ";" + elif benchmark_ext == ".p": + status_regex = ( + r"% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)" + ) + status_to_output = lambda s: "% SZS status {} for {}".format( + s, benchmark_filename + ) + elif benchmark_ext == ".sy": + comment_char = ";" else: - sys.exit('"{}" must be *.smt2 or *.p or *.sy'.format( - benchmark_basename)) + sys.exit('"{}" must be *.smt2 or *.p or *.sy'.format(benchmark_basename)) benchmark_lines = None - with open(benchmark_path, 'r') as benchmark_file: + with open(benchmark_path, "r") as benchmark_file: benchmark_lines = benchmark_file.readlines() - benchmark_content = ''.join(benchmark_lines) + benchmark_content = "".join(benchmark_lines) # Extract the metadata for the benchmark. 
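    # Directives are given on comment lines of the benchmark itself. For an
    # .smt2 file they look like the header of the regression updated in this
    # patch:
    #   ; COMMAND-LINE: --nl-ext-tplanes --no-check-proofs
    #   ; EXPECT: unsat
    # The remaining directives (EXPECT-ERROR:, EXIT:, SCRUBBER:,
    # ERROR-SCRUBBER:, REQUIRES:) are parsed the same way below.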
scrubber = None error_scrubber = None - expected_output = '' - expected_error = '' + expected_output = "" + expected_error = "" expected_exit_status = None command_lines = [] requires = [] @@ -236,31 +533,31 @@ def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, line = line[1:].lstrip() if line.startswith(SCRUBBER): - scrubber = line[len(SCRUBBER):].strip() + scrubber = line[len(SCRUBBER) :].strip() elif line.startswith(ERROR_SCRUBBER): - error_scrubber = line[len(ERROR_SCRUBBER):].strip() + error_scrubber = line[len(ERROR_SCRUBBER) :].strip() elif line.startswith(EXPECT): - expected_output += line[len(EXPECT):].strip() + '\n' + expected_output += line[len(EXPECT) :].strip() + "\n" elif line.startswith(EXPECT_ERROR): - expected_error += line[len(EXPECT_ERROR):].strip() + '\n' + expected_error += line[len(EXPECT_ERROR) :].strip() + "\n" elif line.startswith(EXIT): - expected_exit_status = int(line[len(EXIT):].strip()) + expected_exit_status = int(line[len(EXIT) :].strip()) elif line.startswith(COMMAND_LINE): - command_lines.append(line[len(COMMAND_LINE):].strip()) + command_lines.append(line[len(COMMAND_LINE) :].strip()) elif line.startswith(REQUIRES): - requires.append(line[len(REQUIRES):].strip()) + requires.append(line[len(REQUIRES) :].strip()) expected_output = expected_output.strip() expected_error = expected_error.strip() # Expected output/expected error has not been defined in the metadata for # the benchmark. Try to extract the information from the benchmark itself. - if expected_output == '' and expected_error == '': + if expected_output == "" and expected_error == "": match = None if status_regex: match = re.findall(status_regex, benchmark_content) if match: - expected_output = status_to_output('\n'.join(match)) + expected_output = status_to_output("\n".join(match)) elif expected_exit_status is None: # If there is no expected output/error and the exit status has not # been set explicitly, the benchmark is invalid. 
@@ -268,156 +565,77 @@ def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, if expected_exit_status is None: expected_exit_status = 0 - if 'CVC5_REGRESSION_ARGS' in os.environ: - basic_command_line_args += shlex.split( - os.environ['CVC5_REGRESSION_ARGS']) - - if not check_unsat_cores and ('(get-unsat-core)' in benchmark_content - or '(get-unsat-assumptions)' in benchmark_content): - print( - '1..0 # Skipped regression: unsat cores not supported without proof support' - ) - return (EXIT_SKIP if use_skip_return_code else EXIT_OK) + if "CVC5_REGRESSION_ARGS" in os.environ: + basic_command_line_args += shlex.split(os.environ["CVC5_REGRESSION_ARGS"]) for req_feature in requires: is_negative = False if req_feature.startswith("no-"): - req_feature = req_feature[len("no-"):] + req_feature = req_feature[len("no-") :] is_negative = True if req_feature not in (cvc5_features + cvc5_disabled_features): print( - 'Illegal requirement in regression: {}\nAllowed requirements: {}' - .format(req_feature, - ' '.join(cvc5_features + cvc5_disabled_features))) + "Illegal requirement in regression: {}\nAllowed requirements: {}".format( + req_feature, " ".join(cvc5_features + cvc5_disabled_features) + ) + ) return EXIT_FAILURE if is_negative: if req_feature in cvc5_features: - print('1..0 # Skipped regression: not valid with {}'.format( - req_feature)) - return (EXIT_SKIP if use_skip_return_code else EXIT_OK) + print( + "1..0 # Skipped regression: not valid with {}".format(req_feature) + ) + return EXIT_SKIP if use_skip_return_code else EXIT_OK elif req_feature not in cvc5_features: - print('1..0 # Skipped regression: {} not supported'.format( - req_feature)) - return (EXIT_SKIP if use_skip_return_code else EXIT_OK) + print("1..0 # Skipped regression: {} not supported".format(req_feature)) + return EXIT_SKIP if use_skip_return_code else EXIT_OK if not command_lines: - command_lines.append('') + command_lines.append("") + tests = [] + expected_output_lines = expected_output.split() command_line_args_configs = [] for command_line in command_lines: args = shlex.split(command_line) all_args = basic_command_line_args + args - if not check_unsat_cores and ('--check-unsat-cores' in all_args): - print( - '# Skipped command line options ({}): unsat cores not supported without proof support' - .format(all_args)) - continue - if not check_proofs and '--dump-proofs' in all_args: - print( - '# Skipped command line options ({}): proof production not supported' - .format(all_args)) - continue - command_line_args_configs.append(all_args) - expected_output_lines = expected_output.split() - extra_command_line_args = [] - if benchmark_ext == '.sy' and \ - '--no-check-synth-sol' not in all_args and \ - '--sygus-rr' not in all_args and \ - '--check-synth-sol' not in all_args: - all_args += ['--check-synth-sol'] - if ('sat' in expected_output_lines or \ - 'unknown' in expected_output_lines) and \ - '--no-debug-check-models' not in all_args and \ - '--no-check-models' not in all_args and \ - '--debug-check-models' not in all_args: - extra_command_line_args += ['--debug-check-models'] - if 'unsat' in expected_output_lines in expected_output_lines: - if check_unsat_cores and \ - '--no-produce-unsat-cores' not in all_args and \ - '--no-check-unsat-cores' not in all_args and \ - '--check-unsat-cores' not in all_args and \ - 'sygus-inference' not in benchmark_content and \ - '--unconstrained-simp' not in all_args: - extra_command_line_args += ['--check-unsat-cores'] - if check_proofs and \ - '--no-produce-proofs' 
not in all_args and \ - '--no-check-proofs' not in all_args and \ - '--check-proofs' not in all_args: - extra_command_line_args += ['--check-proofs'] - if '--no-check-abducts' not in all_args and \ - '--check-abducts' not in all_args and \ - 'get-abduct' in benchmark_content: - all_args += ['--check-abducts'] - - # Create a test case for each extra argument - for extra_arg in extra_command_line_args: - command_line_args_configs.append(all_args + [extra_arg]) - - # Run cvc5 on the benchmark with the different option sets and check - # whether the exit status, stdout output, stderr output are as expected. - print('1..{}'.format(len(command_line_args_configs))) - print('# Starting') + benchmark_info = BenchmarkInfo( + wrapper=wrapper, + scrubber=scrubber, + error_scrubber=error_scrubber, + timeout=timeout, + cvc5_binary=cvc5_binary, + benchmark_dir=benchmark_dir, + benchmark_basename=benchmark_basename, + benchmark_ext=benchmark_ext, + benchmark_content=benchmark_content, + expected_output=expected_output, + expected_error=expected_error, + expected_exit_status=expected_exit_status, + command_line_args=all_args, + ) + for tester_name, tester in g_testers.items(): + if tester_name in testers and tester.applies(benchmark_info): + tests.append((tester, benchmark_info)) + + if len(tests) == 0: + print("1..0 # Skipped regression: no tests to run") + return EXIT_SKIP if use_skip_return_code else EXIT_OK + + print("1..{}".format(len(tests))) + print("# Starting") + # Run cvc5 on the benchmark with the different testers and check whether + # the exit status, stdout output, stderr output are as expected. exit_code = EXIT_OK - for command_line_args in command_line_args_configs: - output, error, exit_status = run_benchmark(dump, wrapper, scrubber, - error_scrubber, cvc5_binary, - command_line_args, - benchmark_dir, - benchmark_basename, timeout) - output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE) - error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE) - if exit_status == STATUS_TIMEOUT: - exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE - print('Timeout - Flags: {}'.format(command_line_args)) - elif output != expected_output: - exit_code = EXIT_FAILURE - print('not ok - Flags: {}'.format(command_line_args)) - print() - print('Standard output difference') - print('=' * 80) - print_diff(output, expected_output) - print('=' * 80) - print() - print() - if error: - print('Error output') - print('=' * 80) - print_colored(Color.YELLOW, error) - print('=' * 80) - print() - elif error != expected_error: - exit_code = EXIT_FAILURE - print( - 'not ok - Differences between expected and actual output on stderr - Flags: {}' - .format(command_line_args)) - print() - print('Error output difference') - print('=' * 80) - print_diff(error, expected_error) - print('=' * 80) - print() - elif expected_exit_status != exit_status: + for tester, benchmark_info in tests: + test_exit_code = tester.run(benchmark_info) + if exit_code == EXIT_FAILURE or test_exit_code == EXIT_FAILURE: exit_code = EXIT_FAILURE - print( - 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'. 
- format(expected_exit_status, exit_status, command_line_args)) - print() - print('Output:') - print('=' * 80) - print_colored(Color.BLUE, output) - print('=' * 80) - print() - print() - print('Error output:') - print('=' * 80) - print_colored(Color.YELLOW, error) - print('=' * 80) - print() else: - print('ok - Flags: {}'.format(command_line_args)) + exit_code = test_exit_code return exit_code @@ -427,40 +645,43 @@ def main(): script.""" parser = argparse.ArgumentParser( - description= - 'Runs benchmark and checks for correct exit status and output.') - parser.add_argument('--dump', action='store_true') - parser.add_argument('--use-skip-return-code', action='store_true') - parser.add_argument('--skip-timeout', action='store_true') - parser.add_argument('--check-unsat-cores', action='store_true', - default=True) - parser.add_argument('--no-check-unsat-cores', dest='check_unsat_cores', - action='store_false') - parser.add_argument('--check-proofs', action='store_true', default=True) - parser.add_argument('--no-check-proofs', dest='check_proofs', - action='store_false') - parser.add_argument('wrapper', nargs='*') - parser.add_argument('cvc5_binary') - parser.add_argument('benchmark') + description="Runs benchmark and checks for correct exit status and output." + ) + parser.add_argument("--use-skip-return-code", action="store_true") + parser.add_argument("--skip-timeout", action="store_true") + parser.add_argument("--tester", choices=g_testers.keys(), action="append") + parser.add_argument("wrapper", nargs="*") + parser.add_argument("cvc5_binary") + parser.add_argument("benchmark") argv = sys.argv[1:] # Append options passed via RUN_REGRESSION_ARGS to argv - if os.environ.get('RUN_REGRESSION_ARGS'): - argv.extend(shlex.split(os.getenv('RUN_REGRESSION_ARGS'))) + if os.environ.get("RUN_REGRESSION_ARGS"): + argv.extend(shlex.split(os.getenv("RUN_REGRESSION_ARGS"))) args = parser.parse_args(argv) cvc5_binary = os.path.abspath(args.cvc5_binary) wrapper = args.wrapper - if os.environ.get('VALGRIND') == '1' and not wrapper: - wrapper = ['libtool', '--mode=execute', 'valgrind'] - - timeout = float(os.getenv('TEST_TIMEOUT', '600')) - - return run_regression(args.check_unsat_cores, args.check_proofs, args.dump, - args.use_skip_return_code, args.skip_timeout, - wrapper, cvc5_binary, args.benchmark, timeout) + if os.environ.get("VALGRIND") == "1" and not wrapper: + wrapper = ["libtool", "--mode=execute", "valgrind"] + + timeout = float(os.getenv("TEST_TIMEOUT", "600")) + + testers = args.tester + if not testers: + testers = g_default_testers + + return run_regression( + testers, + args.use_skip_return_code, + args.skip_timeout, + wrapper, + cvc5_binary, + args.benchmark, + timeout, + ) if __name__ == "__main__": -- 2.30.2
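
Usage note: with this refactoring, the checks to run are selected with
repeated --tester options (choices: base, unsat-core, proof, model, synth,
abduct, dump), either directly on the command line, for example

    run_regression.py --tester base --tester proof <cvc5-binary> <benchmark>

or through the RUN_REGRESSION_ARGS environment variable read in main(); the
run_regression_args entries in the ci.yml hunks above select per-job subsets
in the same way. If no --tester option is given, the default testers (all of
the above except dump) are applied to every benchmark they are applicable to.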