Support multiple sets of command line args in regs (#1902)
test/regress/run_regression.py
#!/usr/bin/env python3
"""
Usage:

    run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
        [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p | benchmark.sy ]

Runs a benchmark and checks for the correct exit status and output.
"""

import argparse
import difflib
import os
import re
import shlex
import subprocess
import sys
import threading

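# Directives that a benchmark (or its accompanying ".expect" file) can
# specify in comment lines. For example, an SMT-LIB v2 benchmark might start
# with the following (the options are purely illustrative):
#
#   ; COMMAND-LINE: --incremental
#   ; COMMAND-LINE: --decision=justification
#   ; EXPECT: sat
#
# Each COMMAND-LINE directive specifies one set of options that the
# benchmark is run with.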
SCRUBBER = 'SCRUBBER: '
ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
EXPECT = 'EXPECT: '
EXPECT_ERROR = 'EXPECT-ERROR: '
EXIT = 'EXIT: '
COMMAND_LINE = 'COMMAND-LINE: '


def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""

    proc = subprocess.Popen(
        args,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = 124
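    # Watchdog: kill the process if it does not terminate within `timeout`
    # seconds. The default exit status of 124 follows the convention of the
    # GNU timeout(1) utility.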
    timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
    try:
        timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        timer.cancel()

    return out, err, exit_status


def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If `dump` is true,
    the function first uses CVC4 to read in and dump the benchmark file and
    then uses that as input."""

    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
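        # First use CVC4 to read in and dump the benchmark in SMT-LIB v2
        # format, then run CVC4 on the dumped benchmark, fed in via stdin.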
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        output, _, _ = run_process(
            shlex.split(scrubber), benchmark_dir, timeout, output)
    if error_scrubber:
        error, _, _ = run_process(
            shlex.split(error_scrubber), benchmark_dir, timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)


def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path, timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
    dumps a benchmark and uses that as the input (if dump is true)."""

    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
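        # TPTP: use finite model finding and answer in SZS format.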
        basic_command_line_args.append('--finite-model-find')
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        proof = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    # If there is an ".expect" file for the benchmark, read the metadata
    # from there, otherwise from the benchmark file.
    metadata_filename = benchmark_path + '.expect'
    if os.path.isfile(metadata_filename):
        comment_char = '%'
    else:
        metadata_filename = benchmark_path

    metadata_lines = None
    with open(metadata_filename, 'r') as metadata_file:
        metadata_lines = metadata_file.readlines()

    benchmark_content = None
    if metadata_filename == benchmark_path:
        benchmark_content = ''.join(metadata_lines)
    else:
        with open(benchmark_path, 'r') as benchmark_file:
            benchmark_content = benchmark_file.read()

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    for line in metadata_lines:
        # Skip lines that do not start with the comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):]
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):]
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):]
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):]
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):])
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):])
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # If no expected output or error was defined in the metadata for the
    # benchmark, try to extract the status from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.search(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output(match.group(1))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0

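    # Additional options for all regressions can be supplied via the
    # CVC4_REGRESSION_ARGS environment variable.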
    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not proof and ('(get-unsat-core)' in benchmark_content
                      or '(get-unsat-assumptions)' in benchmark_content
                      or '--check-proofs' in basic_command_line_args
                      or '--dump-proofs' in basic_command_line_args):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return

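    # Run at least one configuration; the empty string corresponds to the
    # default options (no COMMAND-LINE directives in the benchmark).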
    if not command_lines:
        command_lines.append('')

    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        if proof or ('--check-proofs' not in args
                     and '--dump-proofs' not in args):
            all_args = basic_command_line_args + args
            command_line_args_configs.append(all_args)

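            # Depending on the benchmark and the expected output, add one
            # extra configuration that enables additional consistency checks
            # (e.g. --check-models for satisfiable benchmarks), unless the
            # benchmark explicitly opts out of them.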
            extra_command_line_args = []
            if benchmark_ext == '.sy' and \
                    '--no-check-synth-sol' not in all_args and \
                    '--check-synth-sol' not in all_args:
                extra_command_line_args = ['--check-synth-sol']
            if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
                    '--no-check-models' not in all_args:
                extra_command_line_args = ['--check-models']
            if proof and re.search(r'^(unsat|valid)$', expected_output):
                if '--no-check-proofs' not in all_args and \
                        '--incremental' not in all_args and \
                        '--unconstrained-simp' not in all_args and \
                        not cvc4_binary.endswith('pcvc4'):
                    extra_command_line_args = [
                        '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
                        '--no-bv-algebraic'
                    ]
                if '--no-check-unsat-cores' not in all_args and \
                        '--incremental' not in all_args and \
                        '--unconstrained-simp' not in all_args and \
                        not cvc4_binary.endswith('pcvc4'):
                    extra_command_line_args += ['--check-unsat-cores']
            if extra_command_line_args:
                command_line_args_configs.append(
                    all_args + extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output and stderr output are as
    # expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(
            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
            command_line_args, benchmark_dir, benchmark_basename, timeout)
        if output != expected_output:
            print(
                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(output.splitlines(),
                                             expected_output.splitlines()):
                print(line)
        elif error != expected_error:
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(error.splitlines(),
                                             expected_error.splitlines()):
                print(line)
        elif expected_exit_status != exit_status:
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
        else:
            print('ok - Flags: {}'.format(command_line_args))


def main():
    """Parses the command line arguments and then calls the core of the
    script."""

    parser = argparse.ArgumentParser(
        description=
        'Runs a benchmark and checks for the correct exit status and output.')
    parser.add_argument('--proof', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
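    # When the VALGRIND environment variable is set to 1 and no explicit
    # wrapper was given, run CVC4 under valgrind via libtool.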
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

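    # Timeout (in seconds) for each CVC4 invocation; can be overridden via
    # the TEST_TIMEOUT environment variable.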
    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark,
                   timeout)


if __name__ == "__main__":
    main()