From f01ff9cf9d966a758ab60e186c0811bf59e57b72 Mon Sep 17 00:00:00 2001 From: Andres Noetzli Date: Thu, 21 Oct 2021 08:34:26 -0700 Subject: [PATCH] Refactor regressions script (#7249) This makes the regression script more modular by refactoring all the different checks into separate classes, which makes it easier to add additional tests and to run only a subset of the tests. --- .github/workflows/ci.yml | 10 +- .../regress1/nl/approx-sqrt-unsat.smt2 | 2 +- test/regress/run_regression.py | 737 ++++++++++++------ 3 files changed, 485 insertions(+), 264 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b8ee51837..89c108b93 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ jobs: check-examples: true store-to-release: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct - name: macos:production os: macos-11 @@ -25,7 +25,7 @@ jobs: check-examples: true store-to-release: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct - name: ubuntu:production-clang os: ubuntu-18.04 @@ -34,7 +34,7 @@ jobs: cache-key: productionclang check-examples: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct - name: ubuntu:production-dbg os: ubuntu-18.04 @@ -42,7 +42,7 @@ jobs: cache-key: dbg check-units: true exclude_regress: 3-4 - run_regression_args: --no-check-unsat-cores + run_regression_args: --tester base --tester model --tester synth --tester abduct --tester proof - name: ubuntu:production-dbg-clang os: ubuntu-latest @@ -50,7 +50,7 @@ jobs: config: production --auto-download --assertions --tracing --cln --gpl cache-key: dbgclang exclude_regress: 3-4 - run_regression_args: --no-check-proofs + run_regression_args: --tester base --tester model --tester synth --tester abduct --tester unsat-core name: ${{ matrix.name }} runs-on: ${{ matrix.os }} diff --git a/test/regress/regress1/nl/approx-sqrt-unsat.smt2 b/test/regress/regress1/nl/approx-sqrt-unsat.smt2 index 576fb1a67..cda24d098 100644 --- a/test/regress/regress1/nl/approx-sqrt-unsat.smt2 +++ b/test/regress/regress1/nl/approx-sqrt-unsat.smt2 @@ -1,4 +1,4 @@ -; COMMAND-LINE: --nl-ext-tplanes +; COMMAND-LINE: --nl-ext-tplanes --no-check-proofs ; EXPECT: unsat (set-logic QF_NRA) (set-info :status unsat) diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py index 32ba6f6c2..7b1c5a43f 100755 --- a/test/regress/run_regression.py +++ b/test/regress/run_regression.py @@ -11,36 +11,332 @@ # directory for licensing information. # ############################################################################# ## - """ Runs benchmark and checks for correct exit status and output. 
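The individual checks (model checking, proof checking, unsat cores, etc.) are
implemented as "testers" below. By default the base, unsat-core, proof, model,
synth, and abduct testers run where they apply to a benchmark; one or more
--tester options can be passed to run only a subset of them.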
""" import argparse +import collections import difflib import os import re import shlex import subprocess import sys +import tempfile import threading class Color: - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - ENDC = '\033[0m' + BLUE = "\033[94m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + RED = "\033[91m" + ENDC = "\033[0m" + + +class Tester: + def __init__(self): + pass + + def applies(self, benchmark_info): + return True + + def run(self, benchmark_info): + exit_code = EXIT_OK + output, error, exit_status = run_benchmark(benchmark_info) + if exit_status == STATUS_TIMEOUT: + exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE + print("Timeout - Flags: {}".format(command_line_args)) + elif output != benchmark_info.expected_output: + exit_code = EXIT_FAILURE + print("not ok - Flags: {}".format(benchmark_info.command_line_args)) + print() + print("Standard output difference") + print("=" * 80) + print_diff(output, benchmark_info.expected_output) + print("=" * 80) + print() + print() + if error: + print("Error output") + print("=" * 80) + print_colored(Color.YELLOW, error) + print("=" * 80) + print() + elif error != benchmark_info.expected_error: + exit_code = EXIT_FAILURE + print( + "not ok - Differences between expected and actual output on stderr - Flags: {}".format( + benchmark_info.command_line_args + ) + ) + print() + print("Error output difference") + print("=" * 80) + print_diff(error, expected_error) + print("=" * 80) + print() + elif exit_status != benchmark_info.expected_exit_status: + exit_code = EXIT_FAILURE + print( + 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.format( + benchmark_info.expected_exit_status, + exit_status, + benchmark_info.command_line_args, + ) + ) + print() + print("Output:") + print("=" * 80) + print_colored(Color.BLUE, output) + print("=" * 80) + print() + print() + print("Error output:") + print("=" * 80) + print_colored(Color.YELLOW, error) + print("=" * 80) + print() + else: + print("ok - Flags: {}".format(benchmark_info.command_line_args)) + return exit_code + +################################################################################ +# The testers +# +# Testers use `Tester` as a base class and implement `applies()` and `run()` +# methods. The `applies()` method returns `True` if a tester applies to a given +# benchmark and `run()` runs the actual test. Most testers can invoke the +# `run()` method in the base class, which calls the cvc5 binary with a set of +# arguments and checks the expected output (both stdout and stderr) as well as +# the exit status. +# +# To add a new tester, add a class and add it to the `g_tester` dictionary. 
+################################################################################ + + +class BaseTester(Tester): + def __init__(self): + pass + + def run(self, benchmark_info): + return super().run(benchmark_info) + + +class UnsatCoreTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + return ( + benchmark_info.benchmark_ext != ".sy" + and ( + "unsat" in benchmark_info.expected_output.split() + or "entailed" in benchmark_info.expected_output.split() + ) + and "--no-produce-unsat-cores" not in benchmark_info.command_line_args + and "--no-check-unsat-cores" not in benchmark_info.command_line_args + and "--check-unsat-cores" not in benchmark_info.command_line_args + and "sygus-inference" not in benchmark_info.benchmark_content + and "--unconstrained-simp" not in benchmark_info.command_line_args + ) -SCRUBBER = 'SCRUBBER:' -ERROR_SCRUBBER = 'ERROR-SCRUBBER:' -EXPECT = 'EXPECT:' -EXPECT_ERROR = 'EXPECT-ERROR:' -EXIT = 'EXIT:' -COMMAND_LINE = 'COMMAND-LINE:' -REQUIRES = 'REQUIRES:' + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + ["--check-unsat-cores"] + ) + ) + + +class ProofTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + expected_output_lines = benchmark_info.expected_output.split() + return ( + benchmark_info.benchmark_ext != ".sy" + and ( + "unsat" in benchmark_info.expected_output.split() + or "entailed" in benchmark_info.expected_output.split() + ) + and "--no-produce-proofs" not in benchmark_info.command_line_args + and "--no-check-proofs" not in benchmark_info.command_line_args + and "--check-proofs" not in benchmark_info.command_line_args + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + ["--check-proofs"] + ) + ) + + +class ModelTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + expected_output_lines = benchmark_info.expected_output.split() + return ( + benchmark_info.benchmark_ext != ".sy" + and ("sat" in expected_output_lines or "unknown" in expected_output_lines) + and "--no-debug-check-models" not in benchmark_info.command_line_args + and "--no-check-models" not in benchmark_info.command_line_args + and "--debug-check-models" not in benchmark_info.command_line_args + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + ["--debug-check-models"] + ) + ) + + +class SynthTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + return ( + benchmark_info.benchmark_ext == ".sy" + and "--no-check-synth-sol" not in benchmark_info.command_line_args + and "--sygus-rr" not in benchmark_info.command_line_args + and "--check-synth-sol" not in benchmark_info.command_line_args + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + ["--check-synth-sol"] + ) + ) + + +class AbductTester(Tester): + def __init__(self): + pass + + def applies(self, benchmark_info): + return ( + benchmark_info.benchmark_ext != ".sy" + and "--no-check-abducts" not in benchmark_info.command_line_args + and "--check-abducts" not in benchmark_info.command_line_args + and "get-abduct" in benchmark_info.benchmark_content + ) + + def run(self, benchmark_info): + return super().run( + benchmark_info._replace( + 
command_line_args=benchmark_info.command_line_args + ["--check-abducts"] + ) + ) + + +class DumpTester(Tester): + def applies(self, benchmark_info): + return benchmark_info.expected_exit_status == EXIT_OK + + def run(self, benchmark_info): + ext_to_lang = { + ".smt2": "smt2", + ".p": "tptp", + ".sy": "sygus", + } + + tmpf_name = None + with tempfile.NamedTemporaryFile(delete=False) as tmpf: + dump_args = [ + "--parse-only", + "-o", + "raw-benchmark", + f"--output-lang={ext_to_lang[benchmark_info.benchmark_ext]}", + ] + dump_output, _, _ = run_process( + [benchmark_info.cvc5_binary] + + benchmark_info.command_line_args + + dump_args + + [benchmark_info.benchmark_basename], + benchmark_info.benchmark_dir, + benchmark_info.timeout, + ) + + tmpf_name = tmpf.name + tmpf.write(dump_output) + + if not tmpf_name: + return EXIT_FAILURE + + exit_code = super().run( + benchmark_info._replace( + command_line_args=benchmark_info.command_line_args + + [ + "--parse-only", + f"--lang={ext_to_lang[benchmark_info.benchmark_ext]}", + ], + benchmark_basename=tmpf.name, + expected_output="", + ) + ) + os.remove(tmpf.name) + return exit_code + + +g_testers = { + "base": BaseTester(), + "unsat-core": UnsatCoreTester(), + "proof": ProofTester(), + "model": ModelTester(), + "synth": SynthTester(), + "abduct": AbductTester(), + "dump": DumpTester(), +} + +g_default_testers = [ + "base", + "unsat-core", + "proof", + "model", + "synth", + "abduct", +] + +################################################################################ + +BenchmarkInfo = collections.namedtuple( + "BenchmarkInfo", + [ + "wrapper", + "scrubber", + "error_scrubber", + "timeout", + "cvc5_binary", + "benchmark_dir", + "benchmark_basename", + "benchmark_ext", + "benchmark_content", + "expected_output", + "expected_error", + "expected_exit_status", + "command_line_args", + ], +) + +SCRUBBER = "SCRUBBER:" +ERROR_SCRUBBER = "ERROR-SCRUBBER:" +EXPECT = "EXPECT:" +EXPECT_ERROR = "EXPECT-ERROR:" +EXIT = "EXIT:" +COMMAND_LINE = "COMMAND-LINE:" +REQUIRES = "REQUIRES:" EXIT_OK = 0 EXIT_FAILURE = 1 @@ -58,14 +354,12 @@ def print_colored(color, text): def print_diff(actual, expected): """Prints the difference between `actual` and `expected`.""" - for line in difflib.unified_diff(expected.splitlines(), - actual.splitlines(), - 'expected', - 'actual', - lineterm=''): - if line.startswith('+'): + for line in difflib.unified_diff( + expected.splitlines(), actual.splitlines(), "expected", "actual", lineterm="" + ): + if line.startswith("+"): print_colored(Color.GREEN, line) - elif line.startswith('-'): + elif line.startswith("-"): print_colored(Color.RED, line) else: print(line) @@ -78,14 +372,16 @@ def run_process(args, cwd, timeout, s_input=None): output and the exit code of the process. 
If the process times out, the output and the error output are empty and the exit code is 124.""" - proc = subprocess.Popen(args, - cwd=cwd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + proc = subprocess.Popen( + args, + cwd=cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) - out = '' - err = '' + out = "" + err = "" exit_status = STATUS_TIMEOUT try: if timeout: @@ -106,61 +402,60 @@ def run_process(args, cwd, timeout, s_input=None): def get_cvc5_features(cvc5_binary): """Returns a list of features supported by the cvc5 binary `cvc5_binary`.""" - output, _, _ = run_process([cvc5_binary, '--show-config'], None, None) + output, _, _ = run_process([cvc5_binary, "--show-config"], None, None) if isinstance(output, bytes): output = output.decode() features = [] disabled_features = [] - for line in output.split('\n'): - tokens = [t.strip() for t in line.split(':')] + for line in output.split("\n"): + tokens = [t.strip() for t in line.split(":")] if len(tokens) == 2: key, value = tokens - if value == 'yes': + if value == "yes": features.append(key) - elif value == 'no': + elif value == "no": disabled_features.append(key) return features, disabled_features -def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc5_binary, - command_line, benchmark_dir, benchmark_filename, timeout): - """Runs cvc5 on the file `benchmark_filename` in the directory - `benchmark_dir` using the binary `cvc5_binary` with the command line - options `command_line`. The output is scrubbed using `scrubber` and - `error_scrubber` for stdout and stderr, respectively. If dump is true, the - function first uses cvc5 to read in and dump the benchmark file and then - uses that as input.""" +def run_benchmark(benchmark_info): + """Runs cvc5 on a benchmark with the given `benchmark_info`. It runs on the + file `benchmark_basename` in the directory `benchmark_dir` using the binary + `cvc5_binary` with the command line options `command_line_args`. The output + is scrubbed using `scrubber` and `error_scrubber` for stdout and stderr, + respectively.""" - bin_args = wrapper[:] - bin_args.append(cvc5_binary) + bin_args = benchmark_info.wrapper[:] + bin_args.append(benchmark_info.cvc5_binary) output = None error = None exit_status = None - if dump: - dump_args = [ - '--parse-only', '-o', 'raw-benchmark', '--output-lang=smt2' - ] - dump_output, _, _ = run_process( - bin_args + command_line + dump_args + [benchmark_filename], - benchmark_dir, timeout) - output, error, exit_status = run_process( - bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir, - timeout, dump_output) - else: - output, error, exit_status = run_process( - bin_args + command_line + [benchmark_filename], benchmark_dir, - timeout) + output, error, exit_status = run_process( + bin_args + + benchmark_info.command_line_args + + [benchmark_info.benchmark_basename], + benchmark_info.benchmark_dir, + benchmark_info.timeout, + ) # If a scrubber command has been specified then apply it to the output. 
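    # A scrubber is an arbitrary command declared in the benchmark with a
    # SCRUBBER: (or ERROR-SCRUBBER:) directive, e.g. a sed script that
    # normalizes nondeterministic parts of the output (the concrete command is
    # benchmark-specific). It receives the solver output on stdin, and its
    # stdout replaces that output before comparison with the expected output.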
- if scrubber: - output, _, _ = run_process(shlex.split(scrubber), benchmark_dir, - timeout, output) - if error_scrubber: - error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir, - timeout, error) + if benchmark_info.scrubber: + output, _, _ = run_process( + shlex.split(benchmark_info.scrubber), + benchmark_info.benchmark_dir, + benchmark_info.timeout, + output, + ) + if benchmark_info.error_scrubber: + error, _, _ = run_process( + shlex.split(benchmark_info.error_scrubber), + benchmark_info.benchmark_dir, + benchmark_info.timeout, + error, + ) # Popen in Python 3 returns a bytes object instead of a string for # stdout/stderr. @@ -168,22 +463,28 @@ def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc5_binary, output = output.decode() if isinstance(error, bytes): error = error.decode() - return (output.strip(), error.strip(), exit_status) - - -def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, - skip_timeout, wrapper, cvc5_binary, benchmark_path, - timeout): - """Determines the expected output for a benchmark, runs cvc5 on it and then - checks whether the output corresponds to the expected output. Optionally - uses a wrapper `wrapper`, tests unsat cores (if check_unsat_cores is true), - checks proofs (if check_proofs is true), or dumps a benchmark and uses that as - the input (if dump is true). `use_skip_return_code` enables/disables - returning 77 when a test is skipped.""" + output = re.sub(r"^[ \t]*", "", output.strip(), flags=re.MULTILINE) + error = re.sub(r"^[ \t]*", "", error.strip(), flags=re.MULTILINE) + return (output, error, exit_status) + + +def run_regression( + testers, + use_skip_return_code, + skip_timeout, + wrapper, + cvc5_binary, + benchmark_path, + timeout, +): + """Determines the expected output for a benchmark, runs cvc5 on it using + all the specified `testers` and then checks whether the output corresponds + to the expected output. Optionally uses a wrapper `wrapper`. 
+ `use_skip_return_code` enables/disables returning 77 when a test is + skipped.""" if not os.access(cvc5_binary, os.X_OK): - sys.exit( - '"{}" does not exist or is not executable'.format(cvc5_binary)) + sys.exit('"{}" does not exist or is not executable'.format(cvc5_binary)) if not os.path.isfile(benchmark_path): sys.exit('"{}" does not exist or is not a file'.format(benchmark_path)) @@ -194,38 +495,34 @@ def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, benchmark_basename = os.path.basename(benchmark_path) benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename) benchmark_dir = os.path.dirname(benchmark_path) - comment_char = '%' + comment_char = "%" status_regex = None status_to_output = lambda s: s - if benchmark_ext == '.smt': - status_regex = r':status\s*(sat|unsat)' - comment_char = ';' - elif benchmark_ext == '.smt2': - status_regex = r'set-info\s*:status\s*(sat|unsat)' - comment_char = ';' - elif benchmark_ext == '.p': - status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)' - status_to_output = lambda s: '% SZS status {} for {}'.format( - s, benchmark_filename) - elif benchmark_ext == '.sy': - comment_char = ';' - # Do not check proofs/unsat-cores with .sy files - check_unsat_cores = False - check_proofs = False + if benchmark_ext == ".smt2": + status_regex = r"set-info\s*:status\s*(sat|unsat)" + comment_char = ";" + elif benchmark_ext == ".p": + status_regex = ( + r"% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)" + ) + status_to_output = lambda s: "% SZS status {} for {}".format( + s, benchmark_filename + ) + elif benchmark_ext == ".sy": + comment_char = ";" else: - sys.exit('"{}" must be *.smt2 or *.p or *.sy'.format( - benchmark_basename)) + sys.exit('"{}" must be *.smt2 or *.p or *.sy'.format(benchmark_basename)) benchmark_lines = None - with open(benchmark_path, 'r') as benchmark_file: + with open(benchmark_path, "r") as benchmark_file: benchmark_lines = benchmark_file.readlines() - benchmark_content = ''.join(benchmark_lines) + benchmark_content = "".join(benchmark_lines) # Extract the metadata for the benchmark. 
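    # Directives are given on comment lines of the benchmark itself. For an
    # .smt2 file they look like the header of the regression updated in this
    # patch:
    #   ; COMMAND-LINE: --nl-ext-tplanes --no-check-proofs
    #   ; EXPECT: unsat
    # The remaining directives (EXPECT-ERROR:, EXIT:, SCRUBBER:,
    # ERROR-SCRUBBER:, REQUIRES:) are parsed the same way below.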
scrubber = None error_scrubber = None - expected_output = '' - expected_error = '' + expected_output = "" + expected_error = "" expected_exit_status = None command_lines = [] requires = [] @@ -236,31 +533,31 @@ def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, line = line[1:].lstrip() if line.startswith(SCRUBBER): - scrubber = line[len(SCRUBBER):].strip() + scrubber = line[len(SCRUBBER) :].strip() elif line.startswith(ERROR_SCRUBBER): - error_scrubber = line[len(ERROR_SCRUBBER):].strip() + error_scrubber = line[len(ERROR_SCRUBBER) :].strip() elif line.startswith(EXPECT): - expected_output += line[len(EXPECT):].strip() + '\n' + expected_output += line[len(EXPECT) :].strip() + "\n" elif line.startswith(EXPECT_ERROR): - expected_error += line[len(EXPECT_ERROR):].strip() + '\n' + expected_error += line[len(EXPECT_ERROR) :].strip() + "\n" elif line.startswith(EXIT): - expected_exit_status = int(line[len(EXIT):].strip()) + expected_exit_status = int(line[len(EXIT) :].strip()) elif line.startswith(COMMAND_LINE): - command_lines.append(line[len(COMMAND_LINE):].strip()) + command_lines.append(line[len(COMMAND_LINE) :].strip()) elif line.startswith(REQUIRES): - requires.append(line[len(REQUIRES):].strip()) + requires.append(line[len(REQUIRES) :].strip()) expected_output = expected_output.strip() expected_error = expected_error.strip() # Expected output/expected error has not been defined in the metadata for # the benchmark. Try to extract the information from the benchmark itself. - if expected_output == '' and expected_error == '': + if expected_output == "" and expected_error == "": match = None if status_regex: match = re.findall(status_regex, benchmark_content) if match: - expected_output = status_to_output('\n'.join(match)) + expected_output = status_to_output("\n".join(match)) elif expected_exit_status is None: # If there is no expected output/error and the exit status has not # been set explicitly, the benchmark is invalid. 
@@ -268,156 +565,77 @@ def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code, if expected_exit_status is None: expected_exit_status = 0 - if 'CVC5_REGRESSION_ARGS' in os.environ: - basic_command_line_args += shlex.split( - os.environ['CVC5_REGRESSION_ARGS']) - - if not check_unsat_cores and ('(get-unsat-core)' in benchmark_content - or '(get-unsat-assumptions)' in benchmark_content): - print( - '1..0 # Skipped regression: unsat cores not supported without proof support' - ) - return (EXIT_SKIP if use_skip_return_code else EXIT_OK) + if "CVC5_REGRESSION_ARGS" in os.environ: + basic_command_line_args += shlex.split(os.environ["CVC5_REGRESSION_ARGS"]) for req_feature in requires: is_negative = False if req_feature.startswith("no-"): - req_feature = req_feature[len("no-"):] + req_feature = req_feature[len("no-") :] is_negative = True if req_feature not in (cvc5_features + cvc5_disabled_features): print( - 'Illegal requirement in regression: {}\nAllowed requirements: {}' - .format(req_feature, - ' '.join(cvc5_features + cvc5_disabled_features))) + "Illegal requirement in regression: {}\nAllowed requirements: {}".format( + req_feature, " ".join(cvc5_features + cvc5_disabled_features) + ) + ) return EXIT_FAILURE if is_negative: if req_feature in cvc5_features: - print('1..0 # Skipped regression: not valid with {}'.format( - req_feature)) - return (EXIT_SKIP if use_skip_return_code else EXIT_OK) + print( + "1..0 # Skipped regression: not valid with {}".format(req_feature) + ) + return EXIT_SKIP if use_skip_return_code else EXIT_OK elif req_feature not in cvc5_features: - print('1..0 # Skipped regression: {} not supported'.format( - req_feature)) - return (EXIT_SKIP if use_skip_return_code else EXIT_OK) + print("1..0 # Skipped regression: {} not supported".format(req_feature)) + return EXIT_SKIP if use_skip_return_code else EXIT_OK if not command_lines: - command_lines.append('') + command_lines.append("") + tests = [] + expected_output_lines = expected_output.split() command_line_args_configs = [] for command_line in command_lines: args = shlex.split(command_line) all_args = basic_command_line_args + args - if not check_unsat_cores and ('--check-unsat-cores' in all_args): - print( - '# Skipped command line options ({}): unsat cores not supported without proof support' - .format(all_args)) - continue - if not check_proofs and '--dump-proofs' in all_args: - print( - '# Skipped command line options ({}): proof production not supported' - .format(all_args)) - continue - command_line_args_configs.append(all_args) - expected_output_lines = expected_output.split() - extra_command_line_args = [] - if benchmark_ext == '.sy' and \ - '--no-check-synth-sol' not in all_args and \ - '--sygus-rr' not in all_args and \ - '--check-synth-sol' not in all_args: - all_args += ['--check-synth-sol'] - if ('sat' in expected_output_lines or \ - 'unknown' in expected_output_lines) and \ - '--no-debug-check-models' not in all_args and \ - '--no-check-models' not in all_args and \ - '--debug-check-models' not in all_args: - extra_command_line_args += ['--debug-check-models'] - if 'unsat' in expected_output_lines in expected_output_lines: - if check_unsat_cores and \ - '--no-produce-unsat-cores' not in all_args and \ - '--no-check-unsat-cores' not in all_args and \ - '--check-unsat-cores' not in all_args and \ - 'sygus-inference' not in benchmark_content and \ - '--unconstrained-simp' not in all_args: - extra_command_line_args += ['--check-unsat-cores'] - if check_proofs and \ - '--no-produce-proofs' 
not in all_args and \ - '--no-check-proofs' not in all_args and \ - '--check-proofs' not in all_args: - extra_command_line_args += ['--check-proofs'] - if '--no-check-abducts' not in all_args and \ - '--check-abducts' not in all_args and \ - 'get-abduct' in benchmark_content: - all_args += ['--check-abducts'] - - # Create a test case for each extra argument - for extra_arg in extra_command_line_args: - command_line_args_configs.append(all_args + [extra_arg]) - - # Run cvc5 on the benchmark with the different option sets and check - # whether the exit status, stdout output, stderr output are as expected. - print('1..{}'.format(len(command_line_args_configs))) - print('# Starting') + benchmark_info = BenchmarkInfo( + wrapper=wrapper, + scrubber=scrubber, + error_scrubber=error_scrubber, + timeout=timeout, + cvc5_binary=cvc5_binary, + benchmark_dir=benchmark_dir, + benchmark_basename=benchmark_basename, + benchmark_ext=benchmark_ext, + benchmark_content=benchmark_content, + expected_output=expected_output, + expected_error=expected_error, + expected_exit_status=expected_exit_status, + command_line_args=all_args, + ) + for tester_name, tester in g_testers.items(): + if tester_name in testers and tester.applies(benchmark_info): + tests.append((tester, benchmark_info)) + + if len(tests) == 0: + print("1..0 # Skipped regression: no tests to run") + return EXIT_SKIP if use_skip_return_code else EXIT_OK + + print("1..{}".format(len(tests))) + print("# Starting") + # Run cvc5 on the benchmark with the different testers and check whether + # the exit status, stdout output, stderr output are as expected. exit_code = EXIT_OK - for command_line_args in command_line_args_configs: - output, error, exit_status = run_benchmark(dump, wrapper, scrubber, - error_scrubber, cvc5_binary, - command_line_args, - benchmark_dir, - benchmark_basename, timeout) - output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE) - error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE) - if exit_status == STATUS_TIMEOUT: - exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE - print('Timeout - Flags: {}'.format(command_line_args)) - elif output != expected_output: - exit_code = EXIT_FAILURE - print('not ok - Flags: {}'.format(command_line_args)) - print() - print('Standard output difference') - print('=' * 80) - print_diff(output, expected_output) - print('=' * 80) - print() - print() - if error: - print('Error output') - print('=' * 80) - print_colored(Color.YELLOW, error) - print('=' * 80) - print() - elif error != expected_error: - exit_code = EXIT_FAILURE - print( - 'not ok - Differences between expected and actual output on stderr - Flags: {}' - .format(command_line_args)) - print() - print('Error output difference') - print('=' * 80) - print_diff(error, expected_error) - print('=' * 80) - print() - elif expected_exit_status != exit_status: + for tester, benchmark_info in tests: + test_exit_code = tester.run(benchmark_info) + if exit_code == EXIT_FAILURE or test_exit_code == EXIT_FAILURE: exit_code = EXIT_FAILURE - print( - 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'. 
- format(expected_exit_status, exit_status, command_line_args)) - print() - print('Output:') - print('=' * 80) - print_colored(Color.BLUE, output) - print('=' * 80) - print() - print() - print('Error output:') - print('=' * 80) - print_colored(Color.YELLOW, error) - print('=' * 80) - print() else: - print('ok - Flags: {}'.format(command_line_args)) + exit_code = test_exit_code return exit_code @@ -427,40 +645,43 @@ def main(): script.""" parser = argparse.ArgumentParser( - description= - 'Runs benchmark and checks for correct exit status and output.') - parser.add_argument('--dump', action='store_true') - parser.add_argument('--use-skip-return-code', action='store_true') - parser.add_argument('--skip-timeout', action='store_true') - parser.add_argument('--check-unsat-cores', action='store_true', - default=True) - parser.add_argument('--no-check-unsat-cores', dest='check_unsat_cores', - action='store_false') - parser.add_argument('--check-proofs', action='store_true', default=True) - parser.add_argument('--no-check-proofs', dest='check_proofs', - action='store_false') - parser.add_argument('wrapper', nargs='*') - parser.add_argument('cvc5_binary') - parser.add_argument('benchmark') + description="Runs benchmark and checks for correct exit status and output." + ) + parser.add_argument("--use-skip-return-code", action="store_true") + parser.add_argument("--skip-timeout", action="store_true") + parser.add_argument("--tester", choices=g_testers.keys(), action="append") + parser.add_argument("wrapper", nargs="*") + parser.add_argument("cvc5_binary") + parser.add_argument("benchmark") argv = sys.argv[1:] # Append options passed via RUN_REGRESSION_ARGS to argv - if os.environ.get('RUN_REGRESSION_ARGS'): - argv.extend(shlex.split(os.getenv('RUN_REGRESSION_ARGS'))) + if os.environ.get("RUN_REGRESSION_ARGS"): + argv.extend(shlex.split(os.getenv("RUN_REGRESSION_ARGS"))) args = parser.parse_args(argv) cvc5_binary = os.path.abspath(args.cvc5_binary) wrapper = args.wrapper - if os.environ.get('VALGRIND') == '1' and not wrapper: - wrapper = ['libtool', '--mode=execute', 'valgrind'] - - timeout = float(os.getenv('TEST_TIMEOUT', '600')) - - return run_regression(args.check_unsat_cores, args.check_proofs, args.dump, - args.use_skip_return_code, args.skip_timeout, - wrapper, cvc5_binary, args.benchmark, timeout) + if os.environ.get("VALGRIND") == "1" and not wrapper: + wrapper = ["libtool", "--mode=execute", "valgrind"] + + timeout = float(os.getenv("TEST_TIMEOUT", "600")) + + testers = args.tester + if not testers: + testers = g_default_testers + + return run_regression( + testers, + args.use_skip_return_code, + args.skip_timeout, + wrapper, + cvc5_binary, + args.benchmark, + timeout, + ) if __name__ == "__main__": -- 2.30.2
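
Usage note: with this refactoring, the checks to run are selected with
repeated --tester options (choices: base, unsat-core, proof, model, synth,
abduct, dump), either directly on the command line, for example

    run_regression.py --tester base --tester proof <cvc5-binary> <benchmark>

or through the RUN_REGRESSION_ARGS environment variable read in main(); the
run_regression_args entries in the ci.yml hunks above select per-job subsets
in the same way. If no --tester option is given, the default testers (all of
the above except dump) are applied to every benchmark they are applicable to.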