5 run_regression.py [--enable-proof] [--with-lfsc] [--dump]
6 [--use-skip-return-code] [wrapper] cvc4-binary
7 [benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p]
9 Runs benchmark and checks for correct exit status and output.
# Metadata tags that a benchmark file may carry in its comment lines.
# run_regression() scans the benchmark for these prefixes to determine the
# expected behavior and any extra configuration.
SCRUBBER = 'SCRUBBER:'              # shell command to scrub stdout before comparing
ERROR_SCRUBBER = 'ERROR-SCRUBBER:'  # shell command to scrub stderr before comparing
EXPECT = 'EXPECT:'                  # one line of expected stdout output
EXPECT_ERROR = 'EXPECT-ERROR:'      # one line of expected stderr output
EXIT = 'EXIT:'                      # expected exit status of the solver
COMMAND_LINE = 'COMMAND-LINE:'      # extra command-line options to test with
REQUIRES = 'REQUIRES:'              # feature required (or "no-<feature>" excluded)

# Exit codes returned by run_regression(): success, failure, and the
# TAP/automake "skipped test" code 77.
EXIT_OK = 0
EXIT_FAILURE = 1
EXIT_SKIP = 77
def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""
    proc = subprocess.Popen(
        args,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    out = ''
    err = ''
    # Default exit status if communicate() never completes (timeout kill).
    exit_status = 124
    try:
        # `timeout` may be None (e.g. when probing the binary's config), in
        # which case no kill-timer is armed.
        if timeout:
            timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
            timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        # Always cancel the timer so a fast process does not get killed later.
        if timeout:
            timer.cancel()

    return out, err, exit_status
def get_cvc4_features(cvc4_binary):
    """Returns a list of features supported by the CVC4 binary `cvc4_binary`.

    The features are read from the `--show-config` output, whose lines look
    like "<feature> : <yes|no>"."""
    # No timeout: probing the config of a local binary is expected to be fast.
    output, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
    if isinstance(output, bytes):
        output = output.decode()

    features = []
    for line in output.split('\n'):
        tokens = [t.strip() for t in line.split(':')]
        # Only "key : value" lines are feature entries; skip banners etc.
        if len(tokens) == 2:
            key, value = tokens
            if value == 'yes':
                features.append(key)

    return features
def logic_supported_with_proofs(logic):
    """Presumably returns whether proof checking supports benchmarks in
    `logic` — confirm against the full file.

    NOTE(review): only the argument check is visible in this chunk; the
    remainder of the function body is not shown here.
    """
    # `logic` is an SMT-LIB logic name, or None when the benchmark does not
    # declare one.
    assert logic is None or isinstance(logic, str)
def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
    function first uses CVC4 to read in and dump the benchmark file and then
    uses that as input."""
    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
        # First pass: parse the benchmark and dump it back out as SMT2 ...
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        # ... second pass: solve the dumped benchmark read from stdin.
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        output, _, _ = run_process(
            shlex.split(scrubber), benchmark_dir, timeout, output)
    if error_scrubber:
        error, _, _ = run_process(
            shlex.split(error_scrubber), benchmark_dir, timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr, so decode before comparing against the expected text.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)
def run_regression(unsat_cores, proofs, dump, use_skip_return_code, wrapper,
                   cvc4_binary, benchmark_path, timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests unsat cores (if unsat_cores is true),
    checks proofs (if proofs is true), or dumps a benchmark and uses that as
    the input (if dump is true). `use_skip_return_code` enables/disables
    returning 77 when a test is skipped."""
    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    cvc4_features = get_cvc4_features(cvc4_binary)

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)

    # Per-format settings: which comment character carries metadata and how to
    # extract the expected status from the benchmark itself.
    comment_char = '%'
    status_regex = None
    logic_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        logic_regex = r'\(\s*set-logic\s*(.*)\)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(
            s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        unsat_cores = False
        proofs = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    benchmark_lines = None
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_lines = benchmark_file.readlines()
    benchmark_content = ''.join(benchmark_lines)

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    requires = []
    logic = None
    for line in benchmark_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):].strip()
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):].strip()
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):].strip() + '\n'
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):].strip())
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):].strip())
        elif line.startswith(REQUIRES):
            requires.append(line[len(REQUIRES):].strip())
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # Expected output/expected error has not been defined in the metadata for
    # the benchmark. Try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.findall(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output('\n'.join(match))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0

    if logic_regex:
        logic_match = re.findall(logic_regex, benchmark_content)
        if logic_match and len(logic_match) == 1:
            logic = logic_match[0]

    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not unsat_cores and ('(get-unsat-core)' in benchmark_content
                            or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    for req_feature in requires:
        if req_feature.startswith("no-"):
            # "no-<feature>": skip when the binary *does* have the feature.
            inv_feature = req_feature[len("no-"):]
            if inv_feature in cvc4_features:
                print('1..0 # Skipped regression: not valid with {}'.format(
                    inv_feature))
                return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
        elif req_feature not in cvc4_features:
            print('1..0 # Skipped regression: {} not supported'.format(
                req_feature))
            return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    if not command_lines:
        command_lines.append('')

    # Build the set of command-line configurations to test, dropping those
    # that need unsupported features and adding --check-* variants.
    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        all_args = basic_command_line_args + args

        if not unsat_cores and ('--check-unsat-cores' in all_args):
            print(
                '# Skipped command line options ({}): unsat cores not supported without proof support'
                .format(all_args))
            continue
        if not proofs and ('--check-proofs' in all_args
                           or '--dump-proofs' in all_args):
            print(
                '# Skipped command line options ({}): checking proofs not supported without LFSC support'
                .format(all_args))
            continue

        command_line_args_configs.append(all_args)

        extra_command_line_args = []
        if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in all_args and \
            '--sygus-rr' not in all_args and \
            '--check-synth-sol' not in all_args:
            extra_command_line_args = ['--check-synth-sol']
        if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
           '--no-check-models' not in all_args and \
           '--check-models' not in all_args:
            extra_command_line_args = ['--check-models']
        if proofs and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-proofs' not in all_args and \
               '--check-proofs' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args and \
               logic_supported_with_proofs(logic) and \
               not cvc4_binary.endswith('pcvc4'):
                extra_command_line_args = ['--check-proofs']
        if unsat_cores and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-unsat-cores' not in all_args and \
               '--check-unsat-cores' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args and \
               not cvc4_binary.endswith('pcvc4'):
                extra_command_line_args += ['--check-unsat-cores']
        if '--no-check-abducts' not in all_args and \
            '--check-abducts' not in all_args and \
            not cvc4_binary.endswith('pcvc4'):
            extra_command_line_args += ['--check-abducts']
        if extra_command_line_args:
            command_line_args_configs.append(all_args +
                                             extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, stderr output are as expected.
    print('1..{}'.format(len(command_line_args_configs)))
    exit_code = EXIT_OK
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(
            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
            command_line_args, benchmark_dir, benchmark_basename, timeout)
        # Strip leading whitespace on every line before comparing.
        output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE)
        error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE)
        if output != expected_output:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stdout - Flags: {}'
                .format(command_line_args))
            for line in difflib.context_diff(output.splitlines(),
                                             expected_output.splitlines()):
                print(line)
            print()
            print('Error output:')
            print(error)
        elif error != expected_error:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'
                .format(command_line_args))
            for line in difflib.context_diff(error.splitlines(),
                                             expected_error.splitlines()):
                print(line)
        elif expected_exit_status != exit_status:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
            print()
            print('Output:')
            print(output)
            print()
            print('Error output:')
            print(error)
        else:
            print('ok - Flags: {}'.format(command_line_args))

    return exit_code
def main():
    """Parses the command line arguments and then calls the core of the
    script."""
    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    parser.add_argument('--enable-proof', action='store_true')
    parser.add_argument('--with-lfsc', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('--use-skip-return-code', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
    # Under VALGRIND=1 and no explicit wrapper, run the solver through
    # libtool+valgrind.
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    # float() accepts both the string from the environment and the default.
    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    return run_regression(args.enable_proof, args.with_lfsc, args.dump,
                          args.use_skip_return_code, wrapper, cvc4_binary,
                          args.benchmark, timeout)
if __name__ == "__main__":
    # Propagate the regression result (EXIT_OK/EXIT_FAILURE/EXIT_SKIP) as the
    # process exit status.
    sys.exit(main())