run_regression.py to fail on invalid requirements (#5264)
[cvc5.git] / test / regress / run_regression.py
1 #!/usr/bin/env python3
2 #####################
3 ## run_regression.py
4 ## Top contributors (to current version):
5 ## Andres Noetzli, Yoni Zohar, Mathias Preiner
6 ## This file is part of the CVC4 project.
7 ## Copyright (c) 2009-2020 by the authors listed in the file AUTHORS
8 ## in the top-level source directory and their institutional affiliations.
9 ## All rights reserved. See the file COPYING in the top-level source
10 ## directory for licensing information.
11 ##
12 """
13 Usage:
14
15 run_regression.py [--enable-proof] [--with-lfsc] [--dump]
16 [--use-skip-return-code] [wrapper] cvc4-binary
17 [benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p]
18
19 Runs benchmark and checks for correct exit status and output.
20 """
21
22 import argparse
23 import difflib
24 import os
25 import re
26 import shlex
27 import subprocess
28 import sys
29 import threading
30
31
class Color:
    """ANSI terminal escape sequences used to colorize output."""
    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    RED = '\x1b[91m'
    ENDC = '\x1b[0m'  # reset to the default color
38
39
# Directives that a benchmark file may embed in its comment lines to control
# how the regression is run and what output/exit status is expected.
SCRUBBER = 'SCRUBBER:'
ERROR_SCRUBBER = 'ERROR-SCRUBBER:'
EXPECT = 'EXPECT:'
EXPECT_ERROR = 'EXPECT-ERROR:'
EXIT = 'EXIT:'
COMMAND_LINE = 'COMMAND-LINE:'
REQUIRES = 'REQUIRES:'

# Exit codes of this script.
EXIT_OK = 0
EXIT_FAILURE = 1
EXIT_SKIP = 77  # conventional automake/TAP "skipped test" exit code
STATUS_TIMEOUT = 124  # conventional `timeout(1)` exit status for a timeout
52
53
def print_colored(color, text):
    """Prints `text` line by line, each line wrapped in the ANSI color
    sequence `color` and reset afterwards."""

    for current_line in text.splitlines():
        print('{}{}{}'.format(color, current_line, Color.ENDC))
59
60
def print_diff(actual, expected):
    """Prints a unified diff between `actual` and `expected`, coloring
    added lines green and removed lines red."""

    diff_lines = difflib.unified_diff(expected.splitlines(),
                                      actual.splitlines(),
                                      'expected',
                                      'actual',
                                      lineterm='')
    for entry in diff_lines:
        if entry[:1] == '+':
            print_colored(Color.GREEN, entry)
        elif entry[:1] == '-':
            print_colored(Color.RED, entry)
        else:
            print(entry)
75
76
def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""

    proc = subprocess.Popen(args,
                            cwd=cwd,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = STATUS_TIMEOUT
    try:
        if timeout:
            # Kill the process if it has not finished after `timeout`
            # seconds; communicate() below then returns immediately.
            timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
            timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        if timeout:
            # The timer killed the process and is not active anymore.
            # (A returncode of -9 means the process died from SIGKILL; if
            # the timer already fired, translate that into the conventional
            # timeout status 124 rather than reporting the raw signal.)
            if exit_status == -9 and not timer.is_alive():
                exit_status = STATUS_TIMEOUT
            timer.cancel()

    return out, err, exit_status
107
108
def get_cvc4_features(cvc4_binary):
    """Returns two lists: the features enabled in the CVC4 binary
    `cvc4_binary` and the features disabled in it, as reported by
    `--show-config` (lines of the form `name : yes|no`)."""

    output, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
    if isinstance(output, bytes):
        output = output.decode()

    enabled = []
    disabled = []
    for config_line in output.split('\n'):
        parts = [part.strip() for part in config_line.split(':')]
        if len(parts) != 2:
            continue
        name, setting = parts
        if setting == 'yes':
            enabled.append(name)
        elif setting == 'no':
            disabled.append(name)

    return enabled, disabled
128
129
def logic_supported_with_proofs(logic):
    """Returns True if CVC4's proof support covers the SMT-LIB logic
    `logic`. `logic` may be None (no set-logic in the benchmark), which is
    never supported."""
    assert logic is None or isinstance(logic, str)
    # NOTE: every entry must end with a comma. A missing comma makes Python
    # implicitly concatenate the two adjacent string literals (previously
    # "QF_BVLRA" "QF_AUFBV" collapsed into "QF_BVLRAQF_AUFBV", silently
    # dropping both logics from this list).
    return logic in [
        # single theories
        "QF_BV",
        "QF_UF",
        "QF_A",
        "QF_LRA",
        # two theories
        "QF_UFBV",
        "QF_UFLRA",
        "QF_AUF",
        "QF_ALRA",
        "QF_ABV",
        "QF_BVLRA",
        # three theories
        "QF_AUFBV",
        "QF_ABVLRA",
        "QF_UFBVLRA",
        "QF_AUFLRA",
        # four theories
        "QF_AUFBVLRA",
    ]
153
154
def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
    function first uses CVC4 to read in and dump the benchmark file and then
    uses that as input.

    Returns a tuple (stdout, stderr, exit status) with stdout/stderr
    whitespace-stripped and decoded to str."""

    # Optional wrapper command (e.g. valgrind) goes before the binary.
    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
        # First pass: parse the benchmark and dump it back out as SMT-LIB 2;
        # second pass: solve the dumped benchmark read from stdin. This
        # exercises CVC4's printer/parser round-trip.
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    # The scrubbers are shell commands that read stdin and write stdout.
    if scrubber:
        output, _, _ = run_process(shlex.split(scrubber), benchmark_dir,
                                   timeout, output)
    if error_scrubber:
        error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir,
                                  timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)
201
202
def run_regression(unsat_cores, proofs, dump, use_skip_return_code,
                   skip_timeout, wrapper, cvc4_binary, benchmark_path,
                   timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests unsat cores (if unsat_cores is true),
    checks proofs (if proofs is true), or dumps a benchmark and uses that as
    the input (if dump is true). `use_skip_return_code` enables/disables
    returning 77 when a test is skipped.

    Returns one of EXIT_OK, EXIT_FAILURE or EXIT_SKIP. Prints TAP-style
    ("1..N", "ok"/"not ok") progress output to stdout."""

    # Sanity-check the inputs before doing any work.
    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    cvc4_features, cvc4_disabled_features = get_cvc4_features(cvc4_binary)

    basic_command_line_args = []

    # Pick the comment character and the regexes used to extract the
    # expected status/logic based on the benchmark's file extension.
    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    logic_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        logic_regex = r'\(\s*set-logic\s*(.*)\)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        # TPTP benchmarks expect SZS-formatted status lines as output.
        status_to_output = lambda s: '% SZS status {} for {}'.format(
            s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        unsat_cores = False
        proofs = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    benchmark_lines = None
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_lines = benchmark_file.readlines()
    benchmark_content = ''.join(benchmark_lines)

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    requires = []
    logic = None
    for line in benchmark_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):].strip()
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):].strip()
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):].strip() + '\n'
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):].strip())
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):].strip())
        elif line.startswith(REQUIRES):
            requires.append(line[len(REQUIRES):].strip())
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # Expected output/expected error has not been defined in the metadata for
    # the benchmark. Try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.findall(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output('\n'.join(match))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0
    # Only record the logic if exactly one set-logic command is present.
    if logic_regex:
        logic_match = re.findall(logic_regex, benchmark_content)
        if logic_match and len(logic_match) == 1:
            logic = logic_match[0]

    # Extra arguments injected by the build system / CI environment.
    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not unsat_cores and ('(get-unsat-core)' in benchmark_content
                            or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    # Check the REQUIRES directives: a "no-<feature>" requirement skips the
    # test when the feature is enabled; a plain requirement skips it when
    # the feature is disabled. Unknown feature names fail the test.
    for req_feature in requires:
        is_negative = False
        if req_feature.startswith("no-"):
            req_feature = req_feature[len("no-"):]
            is_negative = True
        if req_feature not in (cvc4_features + cvc4_disabled_features):
            print(
                'Illegal requirement in regression: {}\nAllowed requirements: {}'
                .format(req_feature,
                        ' '.join(cvc4_features + cvc4_disabled_features)))
            return EXIT_FAILURE
        if is_negative:
            if req_feature in cvc4_features:
                print('1..0 # Skipped regression: not valid with {}'.format(
                    req_feature))
                return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
        elif req_feature not in cvc4_features:
            print('1..0 # Skipped regression: {} not supported'.format(
                req_feature))
            return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    # With no COMMAND-LINE directive, run once with no extra options.
    if not command_lines:
        command_lines.append('')

    # Build the list of option configurations to run, dropping those that
    # need unsupported features and adding extra checking options
    # (--check-synth-sol, --debug-check-models, --check-unsat-cores,
    # --check-abducts) where they apply.
    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        all_args = basic_command_line_args + args

        if not unsat_cores and ('--check-unsat-cores' in all_args):
            print(
                '# Skipped command line options ({}): unsat cores not supported without proof support'
                .format(all_args))
            continue
        if not proofs and '--dump-proofs' in all_args:
            print(
                '# Skipped command line options ({}): proof production not supported without LFSC support'
                .format(all_args))
            continue

        command_line_args_configs.append(all_args)

        extra_command_line_args = []
        if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in all_args and \
            '--sygus-rr' not in all_args and \
            '--check-synth-sol' not in all_args:
            extra_command_line_args = ['--check-synth-sol']
        # Note: without re.MULTILINE this only matches when the whole
        # expected output is exactly one of these words.
        if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
           '--no-debug-check-models' not in all_args and \
           '--no-check-models' not in all_args and \
           '--debug-check-models' not in all_args:
            extra_command_line_args = ['--debug-check-models']
        if unsat_cores and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-unsat-cores' not in all_args and \
               '--check-unsat-cores' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args:
                extra_command_line_args += ['--check-unsat-cores']
        if '--no-check-abducts' not in all_args and \
            '--check-abducts' not in all_args:
            extra_command_line_args += ['--check-abducts']
        if extra_command_line_args:
            command_line_args_configs.append(all_args +
                                             extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, stderr output are as expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    exit_code = EXIT_OK
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(dump, wrapper, scrubber,
                                                   error_scrubber, cvc4_binary,
                                                   command_line_args,
                                                   benchmark_dir,
                                                   benchmark_basename, timeout)
        # Ignore leading whitespace on each line when comparing output.
        output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE)
        error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE)
        if exit_status == STATUS_TIMEOUT:
            exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE
            print('Timeout - Flags: {}'.format(command_line_args))
        elif output != expected_output:
            exit_code = EXIT_FAILURE
            print('not ok - Flags: {}'.format(command_line_args))
            print()
            print('Standard output difference')
            print('=' * 80)
            print_diff(output, expected_output)
            print('=' * 80)
            print()
            print()
            if error:
                print('Error output')
                print('=' * 80)
                print_colored(Color.YELLOW, error)
                print('=' * 80)
                print()
        elif error != expected_error:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'
                .format(command_line_args))
            print()
            print('Error output difference')
            print('=' * 80)
            print_diff(error, expected_error)
            print('=' * 80)
            print()
        elif expected_exit_status != exit_status:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
            print()
            print('Output:')
            print('=' * 80)
            print_colored(Color.BLUE, output)
            print('=' * 80)
            print()
            print()
            print('Error output:')
            print('=' * 80)
            print_colored(Color.YELLOW, error)
            print('=' * 80)
            print()
        else:
            print('ok - Flags: {}'.format(command_line_args))

    return exit_code
450
451
def main():
    """Parses the command line arguments and runs the regression on the
    given benchmark, returning the script's exit code."""

    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    # All mode switches are plain boolean flags.
    for flag in ('--enable-proof', '--with-lfsc', '--dump',
                 '--use-skip-return-code', '--skip-timeout'):
        parser.add_argument(flag, action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()

    binary_path = os.path.abspath(args.cvc4_binary)

    # Default to running under valgrind when VALGRIND=1 and no explicit
    # wrapper command was given.
    wrapper = args.wrapper
    if not wrapper and os.environ.get('VALGRIND') == '1':
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    return run_regression(args.enable_proof, args.with_lfsc, args.dump,
                          args.use_skip_return_code, args.skip_timeout,
                          wrapper, binary_path, args.benchmark, timeout)
479
480
if __name__ == "__main__":
    sys.exit(main())