test/regress/run_regression.py

   1 #!/usr/bin/env python3
   2 ###############################################################################
   3 # Top contributors (to current version):
   4 #   Andres Noetzli, Mathias Preiner, Yoni Zohar
   5 #
   6 # This file is part of the cvc5 project.
   7 #
   8 # Copyright (c) 2009-2021 by the authors listed in the file AUTHORS
   9 # in the top-level source directory and their institutional affiliations.
  10 # All rights reserved.  See the file COPYING in the top-level source
  11 # directory for licensing information.
  12 # #############################################################################
  13 ##
  14
  15 """
  16 Runs benchmark and checks for correct exit status and output.
  17 """
  18
  19 import argparse
  20 import difflib
  21 import os
  22 import re
  23 import shlex
  24 import subprocess
  25 import sys
  26 import threading
  27
  28
  29 class Color:
  30     BLUE = '\033[94m'
  31     GREEN = '\033[92m'
  32     YELLOW = '\033[93m'
  33     RED = '\033[91m'
  34     ENDC = '\033[0m'
  35
  36
  37 SCRUBBER = 'SCRUBBER:'
  38 ERROR_SCRUBBER = 'ERROR-SCRUBBER:'
  39 EXPECT = 'EXPECT:'
  40 EXPECT_ERROR = 'EXPECT-ERROR:'
  41 EXIT = 'EXIT:'
  42 COMMAND_LINE = 'COMMAND-LINE:'
  43 REQUIRES = 'REQUIRES:'
  44
  45 EXIT_OK = 0
  46 EXIT_FAILURE = 1
  47 EXIT_SKIP = 77
  48 STATUS_TIMEOUT = 124
  49
  50
  51 def print_colored(color, text):
  52     """Prints `text` in color `color`."""
  53
  54     for line in text.splitlines():
  55         print(color + line + Color.ENDC)
  56
  57
  58 def print_diff(actual, expected):
  59     """Prints the difference between `actual` and `expected`."""
  60
  61     for line in difflib.unified_diff(expected.splitlines(),
  62                                      actual.splitlines(),
  63                                      'expected',
  64                                      'actual',
  65                                      lineterm=''):
  66         if line.startswith('+'):
  67             print_colored(Color.GREEN, line)
  68         elif line.startswith('-'):
  69             print_colored(Color.RED, line)
  70         else:
  71             print(line)
  72
  73
  74 def run_process(args, cwd, timeout, s_input=None):
  75     """Runs a process with a timeout `timeout` in seconds. `args` are the
  76     arguments to execute, `cwd` is the working directory and `s_input` is the
  77     input to be sent to the process over stdin. Returns the output, the error
  78     output and the exit code of the process. If the process times out, the
  79     output and the error output are empty and the exit code is 124."""
  80
  81     proc = subprocess.Popen(args,
  82                             cwd=cwd,
  83                             stdin=subprocess.PIPE,
  84                             stdout=subprocess.PIPE,
  85                             stderr=subprocess.PIPE)
  86
  87     out = ''
  88     err = ''
  89     exit_status = STATUS_TIMEOUT
  90     try:
  91         if timeout:
  92             timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
  93             timer.start()
  94         out, err = proc.communicate(input=s_input)
  95         exit_status = proc.returncode
  96     finally:
  97         if timeout:
  98             # The timer killed the process and is not active anymore.
  99             if exit_status == -9 and not timer.is_alive():
 100                 exit_status = STATUS_TIMEOUT
 101             timer.cancel()
 102
 103     return out, err, exit_status
 104
 105
 106 def get_cvc4_features(cvc4_binary):
 107     """Returns a list of features supported by the CVC4 binary `cvc4_binary`."""
 108
 109     output, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
 110     if isinstance(output, bytes):
 111         output = output.decode()
 112
 113     features = []
 114     disabled_features = []
 115     for line in output.split('\n'):
 116         tokens = [t.strip() for t in line.split(':')]
 117         if len(tokens) == 2:
 118             key, value = tokens
 119             if value == 'yes':
 120                 features.append(key)
 121             elif value == 'no':
 122                 disabled_features.append(key)
 123
 124     return features, disabled_features
 125
 126
 127 def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
 128                   command_line, benchmark_dir, benchmark_filename, timeout):
 129     """Runs CVC4 on the file `benchmark_filename` in the directory
 130     `benchmark_dir` using the binary `cvc4_binary` with the command line
 131     options `command_line`. The output is scrubbed using `scrubber` and
 132     `error_scrubber` for stdout and stderr, respectively. If dump is true, the
 133     function first uses CVC4 to read in and dump the benchmark file and then
 134     uses that as input."""
 135
 136     bin_args = wrapper[:]
 137     bin_args.append(cvc4_binary)
 138
 139     output = None
 140     error = None
 141     exit_status = None
 142     if dump:
 143         dump_args = [
 144             '--preprocess-only', '--dump', 'raw-benchmark',
 145             '--output-lang=smt2', '-qq'
 146         ]
 147         dump_output, _, _ = run_process(
 148             bin_args + command_line + dump_args + [benchmark_filename],
 149             benchmark_dir, timeout)
 150         output, error, exit_status = run_process(
 151             bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
 152             timeout, dump_output)
 153     else:
 154         output, error, exit_status = run_process(
 155             bin_args + command_line + [benchmark_filename], benchmark_dir,
 156             timeout)
 157
 158     # If a scrubber command has been specified then apply it to the output.
 159     if scrubber:
 160         output, _, _ = run_process(shlex.split(scrubber), benchmark_dir,
 161                                    timeout, output)
 162     if error_scrubber:
 163         error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir,
 164                                   timeout, error)
 165
 166     # Popen in Python 3 returns a bytes object instead of a string for
 167     # stdout/stderr.
 168     if isinstance(output, bytes):
 169         output = output.decode()
 170     if isinstance(error, bytes):
 171         error = error.decode()
 172     return (output.strip(), error.strip(), exit_status)
 173
 174
 175 def run_regression(check_unsat_cores, check_proofs, dump, use_skip_return_code,
 176                    skip_timeout, wrapper, cvc4_binary, benchmark_path,
 177                    timeout):
 178     """Determines the expected output for a benchmark, runs CVC4 on it and then
 179     checks whether the output corresponds to the expected output. Optionally
 180     uses a wrapper `wrapper`, tests unsat cores (if check_unsat_cores is true),
 181     checks proofs (if check_proofs is true), or dumps a benchmark and uses that as
 182     the input (if dump is true). `use_skip_return_code` enables/disables
 183     returning 77 when a test is skipped."""
 184
 185     if not os.access(cvc4_binary, os.X_OK):
 186         sys.exit(
 187             '"{}" does not exist or is not executable'.format(cvc4_binary))
 188     if not os.path.isfile(benchmark_path):
 189         sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))
 190
 191     cvc4_features, cvc4_disabled_features = get_cvc4_features(cvc4_binary)
 192
 193     basic_command_line_args = []
 194
 195     benchmark_basename = os.path.basename(benchmark_path)
 196     benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
 197     benchmark_dir = os.path.dirname(benchmark_path)
 198     comment_char = '%'
 199     status_regex = None
 200     status_to_output = lambda s: s
 201     if benchmark_ext == '.smt':
 202         status_regex = r':status\s*(sat|unsat)'
 203         comment_char = ';'
 204     elif benchmark_ext == '.smt2':
 205         status_regex = r'set-info\s*:status\s*(sat|unsat)'
 206         comment_char = ';'
 207     elif benchmark_ext == '.cvc':
 208         pass
 209     elif benchmark_ext == '.p':
 210         status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
 211         status_to_output = lambda s: '% SZS status {} for {}'.format(
 212             s, benchmark_filename)
 213     elif benchmark_ext == '.sy':
 214         comment_char = ';'
 215         # Do not check proofs/unsat-cores with .sy files
 216         check_unsat_cores = False
 217         check_proofs = False
 218     else:
 219         sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
 220             benchmark_basename))
 221
 222     benchmark_lines = None
 223     with open(benchmark_path, 'r') as benchmark_file:
 224         benchmark_lines = benchmark_file.readlines()
 225     benchmark_content = ''.join(benchmark_lines)
 226
 227     # Extract the metadata for the benchmark.
 228     scrubber = None
 229     error_scrubber = None
 230     expected_output = ''
 231     expected_error = ''
 232     expected_exit_status = None
 233     command_lines = []
 234     requires = []
 235     for line in benchmark_lines:
 236         # Skip lines that do not start with a comment character.
 237         if line[0] != comment_char:
 238             continue
 239         line = line[1:].lstrip()
 240
 241         if line.startswith(SCRUBBER):
 242             scrubber = line[len(SCRUBBER):].strip()
 243         elif line.startswith(ERROR_SCRUBBER):
 244             error_scrubber = line[len(ERROR_SCRUBBER):].strip()
 245         elif line.startswith(EXPECT):
 246             expected_output += line[len(EXPECT):].strip() + '\n'
 247         elif line.startswith(EXPECT_ERROR):
 248             expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
 249         elif line.startswith(EXIT):
 250             expected_exit_status = int(line[len(EXIT):].strip())
 251         elif line.startswith(COMMAND_LINE):
 252             command_lines.append(line[len(COMMAND_LINE):].strip())
 253         elif line.startswith(REQUIRES):
 254             requires.append(line[len(REQUIRES):].strip())
 255     expected_output = expected_output.strip()
 256     expected_error = expected_error.strip()
 257
 258     # Expected output/expected error has not been defined in the metadata for
 259     # the benchmark. Try to extract the information from the benchmark itself.
 260     if expected_output == '' and expected_error == '':
 261         match = None
 262         if status_regex:
 263             match = re.findall(status_regex, benchmark_content)
 264
 265         if match:
 266             expected_output = status_to_output('\n'.join(match))
 267         elif expected_exit_status is None:
 268             # If there is no expected output/error and the exit status has not
 269             # been set explicitly, the benchmark is invalid.
 270             sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
 271     if expected_exit_status is None:
 272         expected_exit_status = 0
 273
 274     if 'CVC5_REGRESSION_ARGS' in os.environ:
 275         basic_command_line_args += shlex.split(
 276             os.environ['CVC5_REGRESSION_ARGS'])
 277
 278     if not check_unsat_cores and ('(get-unsat-core)' in benchmark_content
 279                             or '(get-unsat-assumptions)' in benchmark_content):
 280         print(
 281             '1..0 # Skipped regression: unsat cores not supported without proof support'
 282         )
 283         return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
 284
 285     for req_feature in requires:
 286         is_negative = False
 287         if req_feature.startswith("no-"):
 288             req_feature = req_feature[len("no-"):]
 289             is_negative = True
 290         if req_feature not in (cvc4_features + cvc4_disabled_features):
 291             print(
 292                 'Illegal requirement in regression: {}\nAllowed requirements: {}'
 293                 .format(req_feature,
 294                         ' '.join(cvc4_features + cvc4_disabled_features)))
 295             return EXIT_FAILURE
 296         if is_negative:
 297             if req_feature in cvc4_features:
 298                 print('1..0 # Skipped regression: not valid with {}'.format(
 299                     req_feature))
 300                 return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
 301         elif req_feature not in cvc4_features:
 302             print('1..0 # Skipped regression: {} not supported'.format(
 303                 req_feature))
 304             return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
 305
 306     if not command_lines:
 307         command_lines.append('')
 308
 309     command_line_args_configs = []
 310     for command_line in command_lines:
 311         args = shlex.split(command_line)
 312         all_args = basic_command_line_args + args
 313
 314         if not check_unsat_cores and ('--check-unsat-cores' in all_args):
 315             print(
 316                 '# Skipped command line options ({}): unsat cores not supported without proof support'
 317                 .format(all_args))
 318             continue
 319         if not check_proofs and '--dump-proofs' in all_args:
 320             print(
 321                 '# Skipped command line options ({}): proof production not supported'
 322                 .format(all_args))
 323             continue
 324
 325         command_line_args_configs.append(all_args)
 326
 327         expected_output_lines = expected_output.split()
 328         extra_command_line_args = []
 329         if benchmark_ext == '.sy' and \
 330             '--no-check-synth-sol' not in all_args and \
 331             '--sygus-rr' not in all_args and \
 332             '--check-synth-sol' not in all_args:
 333             all_args += ['--check-synth-sol']
 334         if ('sat' in expected_output_lines or \
 335             'not_entailed' in expected_output_lines or \
 336             'unknown' in expected_output_lines) and \
 337            '--no-debug-check-models' not in all_args and \
 338            '--no-check-models' not in all_args and \
 339            '--debug-check-models' not in all_args:
 340             extra_command_line_args += ['--debug-check-models']
 341         if 'unsat' in expected_output_lines or 'entailed' in expected_output_lines:
 342             if check_unsat_cores and \
 343                '--no-produce-unsat-cores' not in all_args and \
 344                '--no-check-unsat-cores' not in all_args and \
 345                '--check-unsat-cores' not in all_args and \
 346                '--incremental' not in all_args and \
 347                '--unconstrained-simp' not in all_args:
 348                 extra_command_line_args += ['--check-unsat-cores']
 349             if check_proofs and \
 350                '--no-produce-proofs' not in all_args and \
 351                '--no-check-proofs' not in all_args and \
 352                '--check-proofs' not in all_args:
 353                 extra_command_line_args += ['--check-proofs']
 354         if '--no-check-abducts' not in all_args and \
 355             '--check-abducts' not in all_args and \
 356             'get-abduct' in benchmark_content:
 357             all_args += ['--check-abducts']
 358
 359         # Create a test case for each extra argument
 360         for extra_arg in extra_command_line_args:
 361             command_line_args_configs.append(all_args + [extra_arg])
 362
 363     # Run CVC4 on the benchmark with the different option sets and check
 364     # whether the exit status, stdout output, stderr output are as expected.
 365     print('1..{}'.format(len(command_line_args_configs)))
 366     print('# Starting')
 367     exit_code = EXIT_OK
 368     for command_line_args in command_line_args_configs:
 369         output, error, exit_status = run_benchmark(dump, wrapper, scrubber,
 370                                                    error_scrubber, cvc4_binary,
 371                                                    command_line_args,
 372                                                    benchmark_dir,
 373                                                    benchmark_basename, timeout)
 374         output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE)
 375         error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE)
 376         if exit_status == STATUS_TIMEOUT:
 377             exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE
 378             print('Timeout - Flags: {}'.format(command_line_args))
 379         elif output != expected_output:
 380             exit_code = EXIT_FAILURE
 381             print('not ok - Flags: {}'.format(command_line_args))
 382             print()
 383             print('Standard output difference')
 384             print('=' * 80)
 385             print_diff(output, expected_output)
 386             print('=' * 80)
 387             print()
 388             print()
 389             if error:
 390                 print('Error output')
 391                 print('=' * 80)
 392                 print_colored(Color.YELLOW, error)
 393                 print('=' * 80)
 394                 print()
 395         elif error != expected_error:
 396             exit_code = EXIT_FAILURE
 397             print(
 398                 'not ok - Differences between expected and actual output on stderr - Flags: {}'
 399                 .format(command_line_args))
 400             print()
 401             print('Error output difference')
 402             print('=' * 80)
 403             print_diff(error, expected_error)
 404             print('=' * 80)
 405             print()
 406         elif expected_exit_status != exit_status:
 407             exit_code = EXIT_FAILURE
 408             print(
 409                 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
 410                 format(expected_exit_status, exit_status, command_line_args))
 411             print()
 412             print('Output:')
 413             print('=' * 80)
 414             print_colored(Color.BLUE, output)
 415             print('=' * 80)
 416             print()
 417             print()
 418             print('Error output:')
 419             print('=' * 80)
 420             print_colored(Color.YELLOW, error)
 421             print('=' * 80)
 422             print()
 423         else:
 424             print('ok - Flags: {}'.format(command_line_args))
 425
 426     return exit_code
 427
 428
 429 def main():
 430     """Parses the command line arguments and then calls the core of the
 431     script."""
 432
 433     parser = argparse.ArgumentParser(
 434         description=
 435         'Runs benchmark and checks for correct exit status and output.')
 436     parser.add_argument('--dump', action='store_true')
 437     parser.add_argument('--use-skip-return-code', action='store_true')
 438     parser.add_argument('--skip-timeout', action='store_true')
 439     parser.add_argument('--check-unsat-cores', action='store_true',
 440                         default=True)
 441     parser.add_argument('--no-check-unsat-cores', dest='check_unsat_cores',
 442                         action='store_false')
 443     parser.add_argument('--check-proofs', action='store_true', default=True)
 444     parser.add_argument('--no-check-proofs', dest='check_proofs',
 445                         action='store_false')
 446     parser.add_argument('wrapper', nargs='*')
 447     parser.add_argument('cvc4_binary')
 448     parser.add_argument('benchmark')
 449
 450     argv = sys.argv[1:]
 451     # Append options passed via RUN_REGRESSION_ARGS to argv
 452     if os.environ.get('RUN_REGRESSION_ARGS'):
 453         argv.extend(shlex.split(os.getenv('RUN_REGRESSION_ARGS')))
 454
 455     args = parser.parse_args(argv)
 456
 457     cvc4_binary = os.path.abspath(args.cvc4_binary)
 458
 459     wrapper = args.wrapper
 460     if os.environ.get('VALGRIND') == '1' and not wrapper:
 461         wrapper = ['libtool', '--mode=execute', 'valgrind']
 462
 463     timeout = float(os.getenv('TEST_TIMEOUT', '600'))
 464
 465     return run_regression(args.check_unsat_cores, args.check_proofs, args.dump,
 466                           args.use_skip_return_code, args.skip_timeout,
 467                           wrapper, cvc4_binary, args.benchmark, timeout)
 468
 469
 470 if __name__ == "__main__":
 471     exit_code = main()
 472     sys.exit(exit_code)