[Regressions] Support for running w/ default args (#3436)
[cvc5.git] / test / regress / run_regression.py
1 #!/usr/bin/env python3
2 """
3 Usage:
4
5 run_regression.py [--enable-proof] [--with-lfsc] [--dump]
6 [--use-skip-return-code] [wrapper] cvc4-binary
    [benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p | benchmark.sy]
8
9 Runs benchmark and checks for correct exit status and output.
10 """
11
12 import argparse
13 import difflib
14 import os
15 import re
16 import shlex
17 import subprocess
18 import sys
19 import threading
20
# Metadata directives recognized in a benchmark file's comment lines.
SCRUBBER = 'SCRUBBER:'  # shell command that post-processes stdout before comparison
ERROR_SCRUBBER = 'ERROR-SCRUBBER:'  # shell command that post-processes stderr
EXPECT = 'EXPECT:'  # one expected line of stdout (directive may repeat)
EXPECT_ERROR = 'EXPECT-ERROR:'  # one expected line of stderr (directive may repeat)
EXIT = 'EXIT:'  # expected exit status of the solver
COMMAND_LINE = 'COMMAND-LINE:'  # extra solver options; each occurrence adds a run
REQUIRES = 'REQUIRES:'  # required binary feature; a "no-" prefix inverts the check

# Exit codes of this script itself. 77 is the automake convention for a
# skipped test.
EXIT_OK = 0
EXIT_FAILURE = 1
EXIT_SKIP = 77
32
33
def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""

    proc = subprocess.Popen(
        args,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = 124
    try:
        # Let communicate() enforce the timeout itself. The previous
        # threading.Timer approach killed the process but then still returned
        # the partial output and the killed process's negative return code,
        # contradicting the documented ('', '', 124) contract; it could also
        # reference `timer` before assignment in the cleanup path.
        out, err = proc.communicate(input=s_input, timeout=timeout)
        exit_status = proc.returncode
    except subprocess.TimeoutExpired:
        # Kill the child and reap it so we do not leak a zombie process.
        proc.kill()
        proc.communicate()

    return out, err, exit_status
62
63
def get_cvc4_features(cvc4_binary):
    """Queries `cvc4_binary` with --show-config and returns the list of
    enabled features, i.e. the configuration keys whose value is 'yes'."""

    stdout, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
    # run_process hands back bytes; decode before parsing.
    if isinstance(stdout, bytes):
        stdout = stdout.decode()

    enabled = []
    for config_line in stdout.split('\n'):
        parts = [part.strip() for part in config_line.split(':')]
        # Only consider well-formed "key : value" lines.
        if len(parts) == 2 and parts[1] == 'yes':
            enabled.append(parts[0])

    return enabled
80
81
def logic_supported_with_proofs(logic):
    """Returns True if proof checking is supported for the given SMT-LIB
    logic string (or None if no logic was detected)."""
    assert logic is None or isinstance(logic, str)
    return logic in [
        # single theories
        "QF_BV",
        "QF_UF",
        "QF_A",
        "QF_LRA",
        # two theories
        "QF_UFBV",
        "QF_UFLRA",
        "QF_AUF",
        "QF_ALRA",
        "QF_ABV",
        # BUG FIX: the comma after "QF_BVLRA" was missing, so implicit string
        # concatenation produced "QF_BVLRAQF_AUFBV" and neither QF_BVLRA nor
        # QF_AUFBV was recognized as proof-supported.
        "QF_BVLRA",
        # three theories
        "QF_AUFBV",
        "QF_ABVLRA",
        "QF_UFBVLRA",
        "QF_AUFLRA",
        # four theories
        "QF_AUFBVLRA",
    ]
105
def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on `benchmark_filename` (relative to `benchmark_dir`) with
    the options in `command_line`, optionally via the `wrapper` command.
    stdout/stderr are post-processed with `scrubber`/`error_scrubber` when
    given. When `dump` is true, the benchmark is first round-tripped through
    CVC4's raw-benchmark dumper and the dumped SMT2 is used as stdin input.
    Returns a (stdout, stderr, exit status) triple with stripped strings."""

    base_cmd = wrapper[:] + [cvc4_binary]

    if dump:
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dumped_benchmark, _, _ = run_process(
            base_cmd + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            base_cmd + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dumped_benchmark)
    else:
        output, error, exit_status = run_process(
            base_cmd + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # Apply the scrubber commands to stdout/stderr when they were specified.
    if scrubber:
        output, _, _ = run_process(
            shlex.split(scrubber), benchmark_dir, timeout, output)
    if error_scrubber:
        error, _, _ = run_process(
            shlex.split(error_scrubber), benchmark_dir, timeout, error)

    # Popen in Python 3 returns bytes for stdout/stderr; normalize to str
    # before the caller compares against the expected output.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)
152
153
def run_regression(unsat_cores, proofs, dump, use_skip_return_code, wrapper,
                   cvc4_binary, benchmark_path, timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests unsat cores (if unsat_cores is true),
    checks proofs (if proofs is true), or dumps a benchmark and uses that as
    the input (if dump is true). `use_skip_return_code` enables/disables
    returning 77 when a test is skipped.

    Returns EXIT_OK on success, EXIT_FAILURE on any mismatch, and EXIT_SKIP
    (or EXIT_OK, depending on `use_skip_return_code`) when the benchmark is
    skipped. Progress is reported on stdout in TAP format ("1..N", "ok",
    "not ok")."""

    # Fail early if the binary or the benchmark is unusable.
    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    cvc4_features = get_cvc4_features(cvc4_binary)

    basic_command_line_args = []

    # Pick the comment character, the regexes used to extract the status and
    # logic, and the status-to-output mapping based on the input format.
    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    logic_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        logic_regex = r'\(\s*set-logic\s*(.*)\)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        # TPTP benchmarks report their result in SZS format.
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        unsat_cores = False
        proofs = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    benchmark_lines = None
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_lines = benchmark_file.readlines()
    benchmark_content = ''.join(benchmark_lines)

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    requires = []
    logic = None
    for line in benchmark_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):].strip()
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):].strip()
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):].strip() + '\n'
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):].strip())
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):].strip())
        elif line.startswith(REQUIRES):
            requires.append(line[len(REQUIRES):].strip())
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # Expected output/expected error has not been defined in the metadata for
    # the benchmark. Try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.findall(status_regex, benchmark_content)

        if match:
            # Multiple status annotations (e.g. incremental benchmarks)
            # produce one expected output line per annotation.
            expected_output = status_to_output('\n'.join(match))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0
    # Only a single, unambiguous set-logic command is used; otherwise the
    # logic stays None (treated as unsupported for proof checking).
    if logic_regex:
        logic_match = re.findall(logic_regex, benchmark_content)
        if logic_match and len(logic_match) == 1:
            logic = logic_match[0]

    # Options from the environment are prepended to every run.
    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    # Skip benchmarks that need unsat-core support when it is unavailable.
    if not unsat_cores and ('(get-unsat-core)' in benchmark_content
                            or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    # Honor REQUIRES directives; a "no-" prefix means the feature must be
    # absent from the binary.
    for req_feature in requires:
        if req_feature.startswith("no-"):
            inv_feature = req_feature[len("no-"):]
            if inv_feature in cvc4_features:
                print('1..0 # Skipped regression: not valid with {}'.format(
                    inv_feature))
                return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
        elif req_feature not in cvc4_features:
            print('1..0 # Skipped regression: {} not supported'.format(
                req_feature))
            return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    # Without COMMAND-LINE directives the benchmark still runs once, with the
    # basic arguments only.
    if not command_lines:
        command_lines.append('')

    # Build the list of option configurations to run. Each COMMAND-LINE may
    # additionally spawn a second run with extra checking flags
    # (--check-models, --check-proofs, ...) when applicable.
    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        all_args = basic_command_line_args + args

        if not unsat_cores and ('--check-unsat-cores' in all_args):
            print(
                '# Skipped command line options ({}): unsat cores not supported without proof support'
                .format(all_args))
            continue
        if not proofs and ('--check-proofs' in all_args
                           or '--dump-proofs' in all_args):
            print(
                '# Skipped command line options ({}): checking proofs not supported without LFSC support'
                .format(all_args))
            continue

        command_line_args_configs.append(all_args)

        # Derive the extra checking flags, unless the benchmark already
        # requests or explicitly disables them.
        extra_command_line_args = []
        if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in all_args and \
            '--sygus-rr' not in all_args and \
            '--check-synth-sol' not in all_args:
            extra_command_line_args = ['--check-synth-sol']
        if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
           '--no-check-models' not in all_args and \
           '--check-models' not in all_args:
            extra_command_line_args = ['--check-models']
        if proofs and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-proofs' not in all_args and \
               '--check-proofs' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args and \
               logic_supported_with_proofs(logic) and \
               not cvc4_binary.endswith('pcvc4'):
                extra_command_line_args = ['--check-proofs']
        if unsat_cores and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-unsat-cores' not in all_args and \
               '--check-unsat-cores' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args and \
               not cvc4_binary.endswith('pcvc4'):
                extra_command_line_args += ['--check-unsat-cores']
        if '--no-check-abducts' not in all_args and \
           '--check-abducts' not in all_args and \
           not cvc4_binary.endswith('pcvc4'):
            extra_command_line_args += ['--check-abducts']
        if extra_command_line_args:
            command_line_args_configs.append(all_args +
                                             extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, stderr output are as expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    exit_code = EXIT_OK
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(
            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
            command_line_args, benchmark_dir, benchmark_basename, timeout)
        if output != expected_output:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stdout - Flags: {}'
                .format(command_line_args))
            for line in difflib.context_diff(output.splitlines(),
                                             expected_output.splitlines()):
                print(line)
            print()
            print('Error output:')
            print(error)
        elif error != expected_error:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'
                .format(command_line_args))
            for line in difflib.context_diff(error.splitlines(),
                                             expected_error.splitlines()):
                print(line)
        elif expected_exit_status != exit_status:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
            print()
            print('Output:')
            print(output)
            print()
            print('Error output:')
            print(error)
        else:
            print('ok - Flags: {}'.format(command_line_args))

    return exit_code
379
380
def main():
    """Parses the command-line arguments and delegates to run_regression().
    Returns that function's exit code."""

    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    for flag in ('--enable-proof', '--with-lfsc', '--dump',
                 '--use-skip-return-code'):
        parser.add_argument(flag, action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    opts = parser.parse_args()

    binary_path = os.path.abspath(opts.cvc4_binary)

    # Under VALGRIND=1, default to running the binary through valgrind unless
    # an explicit wrapper was given.
    wrapper = opts.wrapper
    if not wrapper and os.environ.get('VALGRIND') == '1':
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    return run_regression(opts.enable_proof, opts.with_lfsc, opts.dump,
                          opts.use_skip_return_code, wrapper, binary_path,
                          opts.benchmark, timeout)
407
408
if __name__ == "__main__":
    # Propagate the regression result as the script's exit status.
    sys.exit(main())