run_regression: Add --skip-timeout option, lower timeout to 600 seconds. (#5353)
test/regress/run_regression.py
#!/usr/bin/env python3
#####################
## run_regression.py
## Top contributors (to current version):
##   Andres Noetzli, Yoni Zohar, Mathias Preiner
## This file is part of the CVC4 project.
## Copyright (c) 2009-2020 by the authors listed in the file AUTHORS
## in the top-level source directory and their institutional affiliations.
## All rights reserved. See the file COPYING in the top-level source
## directory for licensing information.
##
"""
Usage:

    run_regression.py [--enable-proof] [--with-lfsc] [--dump]
        [--use-skip-return-code] [--skip-timeout] [wrapper] cvc4-binary
        [benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p |
         benchmark.sy]

Runs benchmark and checks for correct exit status and output.
"""
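
# Example invocation (illustrative; the binary and benchmark paths are
# placeholders, not actual files in the repository):
#
#   ./run_regression.py --use-skip-return-code --skip-timeout \
#       /path/to/build/bin/cvc4 regress0/example.smt2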

import argparse
import difflib
import os
import re
import shlex
import subprocess
import sys
import threading


class Color:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'

SCRUBBER = 'SCRUBBER:'
ERROR_SCRUBBER = 'ERROR-SCRUBBER:'
EXPECT = 'EXPECT:'
EXPECT_ERROR = 'EXPECT-ERROR:'
EXIT = 'EXIT:'
COMMAND_LINE = 'COMMAND-LINE:'
REQUIRES = 'REQUIRES:'
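
# A benchmark advertises its expectations in comment lines that use the
# directives above. A minimal .smt2 header might look like this (illustrative
# sketch, not taken from an actual regression):
#
#   ; COMMAND-LINE: --incremental
#   ; EXPECT: sat
#   ; EXIT: 0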

EXIT_OK = 0
EXIT_FAILURE = 1
EXIT_SKIP = 77
STATUS_TIMEOUT = 124


def print_colored(color, text):
    """Prints `text` in color `color`."""

    for line in text.splitlines():
        print(color + line + Color.ENDC)


def print_diff(actual, expected):
    """Prints the difference between `actual` and `expected`."""

    for line in difflib.unified_diff(expected.splitlines(),
                                     actual.splitlines(),
                                     'expected',
                                     'actual',
                                     lineterm=''):
        if line.startswith('+'):
            print_colored(Color.GREEN, line)
        elif line.startswith('-'):
            print_colored(Color.RED, line)
        else:
            print(line)


def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, it is
    killed and the exit code is 124 (STATUS_TIMEOUT)."""

    proc = subprocess.Popen(args,
                            cwd=cwd,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = STATUS_TIMEOUT
    try:
        if timeout:
            timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
            timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        if timeout:
            # The timer killed the process and is not active anymore.
            if exit_status == -9 and not timer.is_alive():
                exit_status = STATUS_TIMEOUT
            timer.cancel()

    return out, err, exit_status
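
# A minimal illustration of run_process (the command here is a stand-in; the
# test driver only ever runs the solver binary and scrubber commands):
#
#   out, err, status = run_process(['echo', 'hello'], cwd=None, timeout=5.0)
#   # status == 0 on success; STATUS_TIMEOUT (124) if the timer fired first.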


def get_cvc4_features(cvc4_binary):
    """Returns a list of features supported by the CVC4 binary
    `cvc4_binary`."""

    output, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
    if isinstance(output, bytes):
        output = output.decode()

    features = []
    for line in output.split('\n'):
        tokens = [t.strip() for t in line.split(':')]
        if len(tokens) == 2:
            key, value = tokens
            if value == 'yes':
                features.append(key)

    return features
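
# The parser above expects `--show-config` to print `<key> : <value>` lines
# and collects the keys whose value is 'yes'. Illustrative sketch of that
# shape (the feature names are examples, not a verified list):
#
#   proof   : yes
#   symfpu  : no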


def logic_supported_with_proofs(logic):
    assert logic is None or isinstance(logic, str)
    return logic in [
        # single theories
        "QF_BV",
        "QF_UF",
        "QF_A",
        "QF_LRA",
        # two theories
        "QF_UFBV",
        "QF_UFLRA",
        "QF_AUF",
        "QF_ALRA",
        "QF_ABV",
        # (The trailing comma matters: without it, Python would implicitly
        # concatenate "QF_BVLRA" with the next string literal.)
        "QF_BVLRA",
        # three theories
        "QF_AUFBV",
        "QF_ABVLRA",
        "QF_UFBVLRA",
        "QF_AUFLRA",
        # four theories
        "QF_AUFBVLRA"
    ]


def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If dump is true,
    the function first uses CVC4 to read in and dump the benchmark file and
    then uses that as input."""

    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        output, _, _ = run_process(shlex.split(scrubber), benchmark_dir,
                                   timeout, output)
    if error_scrubber:
        error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir,
                                  timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)
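
# The scrubbers are arbitrary shell commands that normalize solver output
# before it is compared against the expected output. A hypothetical directive
# (the sed expression is illustrative):
#
#   ; SCRUBBER: sed -e 's/in line [0-9]*/in line N/'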


def run_regression(unsat_cores, proofs, dump, use_skip_return_code,
                   skip_timeout, wrapper, cvc4_binary, benchmark_path,
                   timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and
    then checks whether the output corresponds to the expected output.
    Optionally uses a wrapper `wrapper`, tests unsat cores (if unsat_cores is
    true), checks proofs (if proofs is true), or dumps a benchmark and uses
    that as the input (if dump is true). `use_skip_return_code`
    enables/disables returning 77 when a test is skipped."""

    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    cvc4_features = get_cvc4_features(cvc4_binary)

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    logic_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        logic_regex = r'\(\s*set-logic\s*(.*)\)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(
            s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        unsat_cores = False
        proofs = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    benchmark_lines = None
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_lines = benchmark_file.readlines()
    benchmark_content = ''.join(benchmark_lines)

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    requires = []
    logic = None
    for line in benchmark_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):].strip()
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):].strip()
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):].strip() + '\n'
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):].strip())
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):].strip())
        elif line.startswith(REQUIRES):
            requires.append(line[len(REQUIRES):].strip())
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # Expected output/expected error has not been defined in the metadata for
    # the benchmark. Try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.findall(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output('\n'.join(match))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0
    if logic_regex:
        logic_match = re.findall(logic_regex, benchmark_content)
        if logic_match and len(logic_match) == 1:
            logic = logic_match[0]

    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not unsat_cores and ('(get-unsat-core)' in benchmark_content
                            or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    for req_feature in requires:
        if req_feature.startswith("no-"):
            inv_feature = req_feature[len("no-"):]
            if inv_feature in cvc4_features:
                print('1..0 # Skipped regression: not valid with {}'.format(
                    inv_feature))
                return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
        elif req_feature not in cvc4_features:
            print('1..0 # Skipped regression: {} not supported'.format(
                req_feature))
            return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    if not command_lines:
        command_lines.append('')

    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        all_args = basic_command_line_args + args

        if not unsat_cores and ('--check-unsat-cores' in all_args):
            print(
                '# Skipped command line options ({}): unsat cores not supported without proof support'
                .format(all_args))
            continue
        if not proofs and '--dump-proofs' in all_args:
            print(
                '# Skipped command line options ({}): proof production not supported without LFSC support'
                .format(all_args))
            continue

        command_line_args_configs.append(all_args)

        extra_command_line_args = []
        if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in all_args and \
            '--sygus-rr' not in all_args and \
            '--check-synth-sol' not in all_args:
            extra_command_line_args = ['--check-synth-sol']
        if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
           '--no-debug-check-models' not in all_args and \
           '--no-check-models' not in all_args and \
           '--debug-check-models' not in all_args:
            extra_command_line_args = ['--debug-check-models']
        if unsat_cores and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-unsat-cores' not in all_args and \
               '--check-unsat-cores' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args:
                extra_command_line_args += ['--check-unsat-cores']
        if '--no-check-abducts' not in all_args and \
           '--check-abducts' not in all_args:
            extra_command_line_args += ['--check-abducts']
        if extra_command_line_args:
            command_line_args_configs.append(all_args +
                                             extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, stderr output are as expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    exit_code = EXIT_OK
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(dump, wrapper, scrubber,
                                                   error_scrubber, cvc4_binary,
                                                   command_line_args,
                                                   benchmark_dir,
                                                   benchmark_basename, timeout)
        output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE)
        error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE)
        if exit_status == STATUS_TIMEOUT:
            exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE
            print('Timeout - Flags: {}'.format(command_line_args))
        elif output != expected_output:
            exit_code = EXIT_FAILURE
            print('not ok - Flags: {}'.format(command_line_args))
            print()
            print('Standard output difference')
            print('=' * 80)
            print_diff(output, expected_output)
            print('=' * 80)
            print()
            print()
            if error:
                print('Error output')
                print('=' * 80)
                print_colored(Color.YELLOW, error)
                print('=' * 80)
                print()
        elif error != expected_error:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'
                .format(command_line_args))
            print()
            print('Error output difference')
            print('=' * 80)
            print_diff(error, expected_error)
            print('=' * 80)
            print()
        elif expected_exit_status != exit_status:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
            print()
            print('Output:')
            print('=' * 80)
            print_colored(Color.BLUE, output)
            print('=' * 80)
            print()
            print()
            print('Error output:')
            print('=' * 80)
            print_colored(Color.YELLOW, error)
            print('=' * 80)
            print()
        else:
            print('ok - Flags: {}'.format(command_line_args))

    return exit_code
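
# Note: the '1..N', 'ok - ...', and 'not ok - ...' lines above follow the
# Test Anything Protocol (TAP) conventions, and exit code 77 (EXIT_SKIP) is
# the automake-style convention for a skipped test.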


def main():
    """Parses the command line arguments and then calls the core of the
    script."""

    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    parser.add_argument('--enable-proof', action='store_true')
    parser.add_argument('--with-lfsc', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('--use-skip-return-code', action='store_true')
    parser.add_argument('--skip-timeout', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    return run_regression(args.enable_proof, args.with_lfsc, args.dump,
                          args.use_skip_return_code, args.skip_timeout,
                          wrapper, cvc4_binary, args.benchmark, timeout)
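
# Environment variables recognized above (values shown are illustrative):
#
#   TEST_TIMEOUT=300 ./run_regression.py ...   # override the 600 s default
#   VALGRIND=1 ./run_regression.py ...         # wrap the solver in valgrind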


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)