From efc6163629c6c5de446eccfe81777c93829995d5 Mon Sep 17 00:00:00 2001
From: Andres Noetzli
Date: Thu, 5 Apr 2018 21:45:17 -0700
Subject: [PATCH] Python regression script (#1662)

Until now, we have been using a bash script for running the regressions.
That script had several issues, e.g. it was creating many temporary
files and calling itself recursively, which made it difficult to
maintain. This commit replaces the bash script with a Python script.

The Python script uses the TAP protocol, which allows us to display the
results of multiple tests per file (e.g. with --check-models and
without) separately and with more detail. It also prints a message
whenever it starts running a test, which makes it easy to spot "stuck"
regression tests on Travis. As recommended in the automake
documentation [0], the commit copies the TAP driver into the source
tree to enable TAP support for the test driver.

Note: the commit also changes some of the regressions slightly because
we were using "%" for the test directives in SMT files, which does not
correspond to the comment character ";". Using the comment character
simplifies the handling and makes it easier for users to reproduce a
failure (they can simply run CVC4 on the regression file).

[0] https://www.gnu.org/software/automake/manual/html_node/Use-TAP-with-the-Automake-test-harness.html
---
 config/tap-driver.sh                          | 651 ++++++++++++++++++
 configure.ac                                  |   1 +
 test/regress/Makefile.am                      |   8 +-
 test/regress/regress0/arrays/incorrect10.smt  |   4 +-
 test/regress/regress0/expect/scrub.02.smt     |  10 +-
 .../fmf/Arrow_Order-smtlib.778341.smt         |   4 +-
 .../regress/regress0/fmf/QEpres-uf.855035.smt |   4 +-
 test/regress/regress1/fmf/Hoare-z3.931718.smt |   4 +-
 test/regress/run_regression                   | 424 ------------
 test/regress/run_regression.py                | 287 ++++++++
 10 files changed, 957 insertions(+), 440 deletions(-)
 create mode 100755 config/tap-driver.sh
 delete mode 100755 test/regress/run_regression
 create mode 100755 test/regress/run_regression.py

diff --git a/config/tap-driver.sh b/config/tap-driver.sh
new file mode 100755
index 000000000..d3e68bdc1
--- /dev/null
+++ b/config/tap-driver.sh
@@ -0,0 +1,651 @@
+#! /bin/sh
+# Copyright (C) 2011-2018 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+scriptversion=2013-12-23.17; # UTC
+
+# Make unconditional expansion of undefined variables an error.  This
+# helps a lot in preventing typo-related bugs.
+set -u + +me=tap-driver.sh + +fatal () +{ + echo "$me: fatal: $*" >&2 + exit 1 +} + +usage_error () +{ + echo "$me: $*" >&2 + print_usage >&2 + exit 2 +} + +print_usage () +{ + cat < + # + trap : 1 3 2 13 15 + if test $merge -gt 0; then + exec 2>&1 + else + exec 2>&3 + fi + "$@" + echo $? + ) | LC_ALL=C ${AM_TAP_AWK-awk} \ + -v me="$me" \ + -v test_script_name="$test_name" \ + -v log_file="$log_file" \ + -v trs_file="$trs_file" \ + -v expect_failure="$expect_failure" \ + -v merge="$merge" \ + -v ignore_exit="$ignore_exit" \ + -v comments="$comments" \ + -v diag_string="$diag_string" \ +' +# TODO: the usages of "cat >&3" below could be optimized when using +# GNU awk, and/on on systems that supports /dev/fd/. + +# Implementation note: in what follows, `result_obj` will be an +# associative array that (partly) simulates a TAP result object +# from the `TAP::Parser` perl module. + +## ----------- ## +## FUNCTIONS ## +## ----------- ## + +function fatal(msg) +{ + print me ": " msg | "cat >&2" + exit 1 +} + +function abort(where) +{ + fatal("internal error " where) +} + +# Convert a boolean to a "yes"/"no" string. +function yn(bool) +{ + return bool ? "yes" : "no"; +} + +function add_test_result(result) +{ + if (!test_results_index) + test_results_index = 0 + test_results_list[test_results_index] = result + test_results_index += 1 + test_results_seen[result] = 1; +} + +# Whether the test script should be re-run by "make recheck". +function must_recheck() +{ + for (k in test_results_seen) + if (k != "XFAIL" && k != "PASS" && k != "SKIP") + return 1 + return 0 +} + +# Whether the content of the log file associated to this test should +# be copied into the "global" test-suite.log. +function copy_in_global_log() +{ + for (k in test_results_seen) + if (k != "PASS") + return 1 + return 0 +} + +function get_global_test_result() +{ + if ("ERROR" in test_results_seen) + return "ERROR" + if ("FAIL" in test_results_seen || "XPASS" in test_results_seen) + return "FAIL" + all_skipped = 1 + for (k in test_results_seen) + if (k != "SKIP") + all_skipped = 0 + if (all_skipped) + return "SKIP" + return "PASS"; +} + +function stringify_result_obj(result_obj) +{ + if (result_obj["is_unplanned"] || result_obj["number"] != testno) + return "ERROR" + + if (plan_seen == LATE_PLAN) + return "ERROR" + + if (result_obj["directive"] == "TODO") + return result_obj["is_ok"] ? "XPASS" : "XFAIL" + + if (result_obj["directive"] == "SKIP") + return result_obj["is_ok"] ? "SKIP" : COOKED_FAIL; + + if (length(result_obj["directive"])) + abort("in function stringify_result_obj()") + + return result_obj["is_ok"] ? COOKED_PASS : COOKED_FAIL +} + +function decorate_result(result) +{ + color_name = color_for_result[result] + if (color_name) + return color_map[color_name] "" result "" color_map["std"] + # If we are not using colorized output, or if we do not know how + # to colorize the given result, we should return it unchanged. + return result +} + +function report(result, details) +{ + if (result ~ /^(X?(PASS|FAIL)|SKIP|ERROR)/) + { + msg = ": " test_script_name + add_test_result(result) + } + else if (result == "#") + { + msg = " " test_script_name ":" + } + else + { + abort("in function report()") + } + if (length(details)) + msg = msg " " details + # Output on console might be colorized. + print decorate_result(result) msg + # Log the result in the log file too, to help debugging (this is + # especially true when said result is a TAP error or "Bail out!"). 
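+  # (File descriptor 3 is attached to the per-test log file via the
+  # `3>"$log_file"` redirection near the end of this script, so the
+  # following print appends the result line to that log.)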
+ print result msg | "cat >&3"; +} + +function testsuite_error(error_message) +{ + report("ERROR", "- " error_message) +} + +function handle_tap_result() +{ + details = result_obj["number"]; + if (length(result_obj["description"])) + details = details " " result_obj["description"] + + if (plan_seen == LATE_PLAN) + { + details = details " # AFTER LATE PLAN"; + } + else if (result_obj["is_unplanned"]) + { + details = details " # UNPLANNED"; + } + else if (result_obj["number"] != testno) + { + details = sprintf("%s # OUT-OF-ORDER (expecting %d)", + details, testno); + } + else if (result_obj["directive"]) + { + details = details " # " result_obj["directive"]; + if (length(result_obj["explanation"])) + details = details " " result_obj["explanation"] + } + + report(stringify_result_obj(result_obj), details) +} + +# `skip_reason` should be empty whenever planned > 0. +function handle_tap_plan(planned, skip_reason) +{ + planned += 0 # Avoid getting confused if, say, `planned` is "00" + if (length(skip_reason) && planned > 0) + abort("in function handle_tap_plan()") + if (plan_seen) + { + # Error, only one plan per stream is acceptable. + testsuite_error("multiple test plans") + return; + } + planned_tests = planned + # The TAP plan can come before or after *all* the TAP results; we speak + # respectively of an "early" or a "late" plan. If we see the plan line + # after at least one TAP result has been seen, assume we have a late + # plan; in this case, any further test result seen after the plan will + # be flagged as an error. + plan_seen = (testno >= 1 ? LATE_PLAN : EARLY_PLAN) + # If testno > 0, we have an error ("too many tests run") that will be + # automatically dealt with later, so do not worry about it here. If + # $plan_seen is true, we have an error due to a repeated plan, and that + # has already been dealt with above. Otherwise, we have a valid "plan + # with SKIP" specification, and should report it as a particular kind + # of SKIP result. + if (planned == 0 && testno == 0) + { + if (length(skip_reason)) + skip_reason = "- " skip_reason; + report("SKIP", skip_reason); + } +} + +function extract_tap_comment(line) +{ + if (index(line, diag_string) == 1) + { + # Strip leading `diag_string` from `line`. + line = substr(line, length(diag_string) + 1) + # And strip any leading and trailing whitespace left. + sub("^[ \t]*", "", line) + sub("[ \t]*$", "", line) + # Return what is left (if any). + return line; + } + return ""; +} + +# When this function is called, we know that line is a TAP result line, +# so that it matches the (perl) RE "^(not )?ok\b". +function setup_result_obj(line) +{ + # Get the result, and remove it from the line. + result_obj["is_ok"] = (substr(line, 1, 2) == "ok" ? 1 : 0) + sub("^(not )?ok[ \t]*", "", line) + + # If the result has an explicit number, get it and strip it; otherwise, + # automatically assing the next progresive number to it. + if (line ~ /^[0-9]+$/ || line ~ /^[0-9]+[^a-zA-Z0-9_]/) + { + match(line, "^[0-9]+") + # The final `+ 0` is to normalize numbers with leading zeros. + result_obj["number"] = substr(line, 1, RLENGTH) + 0 + line = substr(line, RLENGTH + 1) + } + else + { + result_obj["number"] = testno + } + + if (plan_seen == LATE_PLAN) + # No further test results are acceptable after a "late" TAP plan + # has been seen. + result_obj["is_unplanned"] = 1 + else if (plan_seen && testno > planned_tests) + result_obj["is_unplanned"] = 1 + else + result_obj["is_unplanned"] = 0 + + # Strip trailing and leading whitespace. 
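+  # (Worked example: for the input line `not ok 3 - foo # SKIP win32 only`,
+  # is_ok and number have already been set above; the code below strips the
+  # surrounding whitespace and then splits the rest into the description
+  # `- foo`, the directive `SKIP` and the explanation `win32 only`.)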
+ sub("^[ \t]*", "", line) + sub("[ \t]*$", "", line) + + # This will have to be corrected if we have a "TODO"/"SKIP" directive. + result_obj["description"] = line + result_obj["directive"] = "" + result_obj["explanation"] = "" + + if (index(line, "#") == 0) + return # No possible directive, nothing more to do. + + # Directives are case-insensitive. + rx = "[ \t]*#[ \t]*([tT][oO][dD][oO]|[sS][kK][iI][pP])[ \t]*" + + # See whether we have the directive, and if yes, where. + pos = match(line, rx "$") + if (!pos) + pos = match(line, rx "[^a-zA-Z0-9_]") + + # If there was no TAP directive, we have nothing more to do. + if (!pos) + return + + # Let`s now see if the TAP directive has been escaped. For example: + # escaped: ok \# SKIP + # not escaped: ok \\# SKIP + # escaped: ok \\\\\# SKIP + # not escaped: ok \ # SKIP + if (substr(line, pos, 1) == "#") + { + bslash_count = 0 + for (i = pos; i > 1 && substr(line, i - 1, 1) == "\\"; i--) + bslash_count += 1 + if (bslash_count % 2) + return # Directive was escaped. + } + + # Strip the directive and its explanation (if any) from the test + # description. + result_obj["description"] = substr(line, 1, pos - 1) + # Now remove the test description from the line, that has been dealt + # with already. + line = substr(line, pos) + # Strip the directive, and save its value (normalized to upper case). + sub("^[ \t]*#[ \t]*", "", line) + result_obj["directive"] = toupper(substr(line, 1, 4)) + line = substr(line, 5) + # Now get the explanation for the directive (if any), with leading + # and trailing whitespace removed. + sub("^[ \t]*", "", line) + sub("[ \t]*$", "", line) + result_obj["explanation"] = line +} + +function get_test_exit_message(status) +{ + if (status == 0) + return "" + if (status !~ /^[1-9][0-9]*$/) + abort("getting exit status") + if (status < 127) + exit_details = "" + else if (status == 127) + exit_details = " (command not found?)" + else if (status >= 128 && status <= 255) + exit_details = sprintf(" (terminated by signal %d?)", status - 128) + else if (status > 256 && status <= 384) + # We used to report an "abnormal termination" here, but some Korn + # shells, when a child process die due to signal number n, can leave + # in $? an exit status of 256+n instead of the more standard 128+n. + # Apparently, both behaviours are allowed by POSIX (2008), so be + # prepared to handle them both. See also Austing Group report ID + # 0000051 + exit_details = sprintf(" (terminated by signal %d?)", status - 256) + else + # Never seen in practice. + exit_details = " (abnormal termination)" + return sprintf("exited with status %d%s", status, exit_details) +} + +function write_test_results() +{ + print ":global-test-result: " get_global_test_result() > trs_file + print ":recheck: " yn(must_recheck()) > trs_file + print ":copy-in-global-log: " yn(copy_in_global_log()) > trs_file + for (i = 0; i < test_results_index; i += 1) + print ":test-result: " test_results_list[i] > trs_file + close(trs_file); +} + +BEGIN { + +## ------- ## +## SETUP ## +## ------- ## + +'"$init_colors"' + +# Properly initialized once the TAP plan is seen. +planned_tests = 0 + +COOKED_PASS = expect_failure ? "XPASS": "PASS"; +COOKED_FAIL = expect_failure ? "XFAIL": "FAIL"; + +# Enumeration-like constants to remember which kind of plan (if any) +# has been seen. It is important that NO_PLAN evaluates "false" as +# a boolean. +NO_PLAN = 0 +EARLY_PLAN = 1 +LATE_PLAN = 2 + +testno = 0 # Number of test results seen so far. +bailed_out = 0 # Whether a "Bail out!" directive has been seen. 
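+
+# A quick TAP refresher: the test script under test emits a stream like
+#
+#   1..2
+#   ok 1 - first test point
+#   not ok 2 - second test point # TODO known failure
+#
+# i.e. a plan line "1..N" plus one "ok"/"not ok" line per test point,
+# optionally carrying a "# TODO" or "# SKIP" directive. The loop below
+# parses exactly that. Note also that the shell co-process feeding this
+# awk program appends the test script exit status as the very last input
+# line, which is why the loop reads with a one-line delay (`nextline`):
+# the final line must be treated as an exit status, not as TAP.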
+ +# Whether the TAP plan has been seen or not, and if yes, which kind +# it is ("early" is seen before any test result, "late" otherwise). +plan_seen = NO_PLAN + +## --------- ## +## PARSING ## +## --------- ## + +is_first_read = 1 + +while (1) + { + # Involutions required so that we are able to read the exit status + # from the last input line. + st = getline + if (st < 0) # I/O error. + fatal("I/O error while reading from input stream") + else if (st == 0) # End-of-input + { + if (is_first_read) + abort("in input loop: only one input line") + break + } + if (is_first_read) + { + is_first_read = 0 + nextline = $0 + continue + } + else + { + curline = nextline + nextline = $0 + $0 = curline + } + # Copy any input line verbatim into the log file. + print | "cat >&3" + # Parsing of TAP input should stop after a "Bail out!" directive. + if (bailed_out) + continue + + # TAP test result. + if ($0 ~ /^(not )?ok$/ || $0 ~ /^(not )?ok[^a-zA-Z0-9_]/) + { + testno += 1 + setup_result_obj($0) + handle_tap_result() + } + # TAP plan (normal or "SKIP" without explanation). + else if ($0 ~ /^1\.\.[0-9]+[ \t]*$/) + { + # The next two lines will put the number of planned tests in $0. + sub("^1\\.\\.", "") + sub("[^0-9]*$", "") + handle_tap_plan($0, "") + continue + } + # TAP "SKIP" plan, with an explanation. + else if ($0 ~ /^1\.\.0+[ \t]*#/) + { + # The next lines will put the skip explanation in $0, stripping + # any leading and trailing whitespace. This is a little more + # tricky in truth, since we want to also strip a potential leading + # "SKIP" string from the message. + sub("^[^#]*#[ \t]*(SKIP[: \t][ \t]*)?", "") + sub("[ \t]*$", ""); + handle_tap_plan(0, $0) + } + # "Bail out!" magic. + # Older versions of prove and TAP::Harness (e.g., 3.17) did not + # recognize a "Bail out!" directive when preceded by leading + # whitespace, but more modern versions (e.g., 3.23) do. So we + # emulate the latter, "more modern" behaviour. + else if ($0 ~ /^[ \t]*Bail out!/) + { + bailed_out = 1 + # Get the bailout message (if any), with leading and trailing + # whitespace stripped. The message remains stored in `$0`. + sub("^[ \t]*Bail out![ \t]*", ""); + sub("[ \t]*$", ""); + # Format the error message for the + bailout_message = "Bail out!" + if (length($0)) + bailout_message = bailout_message " " $0 + testsuite_error(bailout_message) + } + # Maybe we have too look for dianogtic comments too. + else if (comments != 0) + { + comment = extract_tap_comment($0); + if (length(comment)) + report("#", comment); + } + } + +## -------- ## +## FINISH ## +## -------- ## + +# A "Bail out!" directive should cause us to ignore any following TAP +# error, as well as a non-zero exit status from the TAP producer. +if (!bailed_out) + { + if (!plan_seen) + { + testsuite_error("missing test plan") + } + else if (planned_tests != testno) + { + bad_amount = testno > planned_tests ? "many" : "few" + testsuite_error(sprintf("too %s tests run (expected %d, got %d)", + bad_amount, planned_tests, testno)) + } + if (!ignore_exit) + { + # Fetch exit status from the last line. + exit_message = get_test_exit_message(nextline) + if (exit_message) + testsuite_error(exit_message) + } + } + +write_test_results() + +exit 0 + +} # End of "BEGIN" block. +' + +# TODO: document that we consume the file descriptor 3 :-( +} 3>"$log_file" + +test $? 
-eq 0 || fatal "I/O or internal error" + +# Local Variables: +# mode: shell-script +# sh-indentation: 2 +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC0" +# time-stamp-end: "; # UTC" +# End: diff --git a/configure.ac b/configure.ac index fb225b688..92053daf6 100644 --- a/configure.ac +++ b/configure.ac @@ -14,6 +14,7 @@ AC_PREREQ([2.61]) AC_INIT([cvc4], _CVC4_RELEASE_STRING, [cvc4-bugs@cs.stanford.edu]) AC_CONFIG_SRCDIR([src/include/cvc4_public.h]) AC_CONFIG_AUX_DIR([config]) +AC_REQUIRE_AUX_FILE([tap-driver.sh]) AC_CONFIG_MACRO_DIR([config]) AC_CONFIG_LIBOBJ_DIR([src/lib]) diff --git a/test/regress/Makefile.am b/test/regress/Makefile.am index f068ce9ad..19cbbae4a 100644 --- a/test/regress/Makefile.am +++ b/test/regress/Makefile.am @@ -9,7 +9,9 @@ TESTS = $(REG0_TESTS) $(REG1_TESTS) $(REG2_TESTS) $(REG3_TESTS) $(REG4_TESTS) @mk_empty@BINARY = cvc4 end@mk_if@ -LOG_COMPILER = $(top_srcdir)/test/regress/run_regression +LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ + $(top_srcdir)/config/tap-driver.sh --comments +LOG_COMPILER = @abs_top_srcdir@/test/regress/run_regression.py AM_LOG_FLAGS = $(RUN_REGRESSION_ARGS) @abs_top_builddir@/src/main/$(BINARY)$(EXEEXT) if AUTOMAKE_1_11 @@ -55,5 +57,5 @@ EXTRA_DIST = \ regress0/tptp/Axioms/SYN000_0.ax \ Makefile.levels \ Makefile.tests \ - run_regression \ - README.md + run_regression.py \ + README.md \ No newline at end of file diff --git a/test/regress/regress0/arrays/incorrect10.smt b/test/regress/regress0/arrays/incorrect10.smt index 6fbdb7ee9..6aacc37c0 100644 --- a/test/regress/regress0/arrays/incorrect10.smt +++ b/test/regress/regress0/arrays/incorrect10.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --no-check-proofs -% EXPECT: unsat +; COMMAND-LINE: --no-check-proofs +; EXPECT: unsat (benchmark fuzzsmt :logic QF_AX :status unsat diff --git a/test/regress/regress0/expect/scrub.02.smt b/test/regress/regress0/expect/scrub.02.smt index 145801619..65c61ecd0 100644 --- a/test/regress/regress0/expect/scrub.02.smt +++ b/test/regress/regress0/expect/scrub.02.smt @@ -1,8 +1,8 @@ -% SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/' -% EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic. -% EXPECT: The fact in question: TERM -% EXPECT: ") -% EXIT: 1 +; SCRUBBER: sed -e 's/The fact in question: .*$/The fact in question: TERM/' +; EXPECT: (error "A non-linear fact was asserted to arithmetic in a linear logic. 
+; EXPECT: The fact in question: TERM +; EXPECT: ") +; EXIT: 1 (benchmark reject_nonlinear :logic QF_LRA :extrafuns ((n Real)) diff --git a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt index f62f057e4..e8c7949dc 100644 --- a/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt +++ b/test/regress/regress0/fmf/Arrow_Order-smtlib.778341.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find --mbqi=gen-ev -% EXPECT: unsat +; COMMAND-LINE: --finite-model-find --mbqi=gen-ev +; EXPECT: unsat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/regress0/fmf/QEpres-uf.855035.smt b/test/regress/regress0/fmf/QEpres-uf.855035.smt index 2945c8f4d..4fe592638 100644 --- a/test/regress/regress0/fmf/QEpres-uf.855035.smt +++ b/test/regress/regress0/fmf/QEpres-uf.855035.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find --mbqi=gen-ev -% EXPECT: sat +; COMMAND-LINE: --finite-model-find --mbqi=gen-ev +; EXPECT: sat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/regress1/fmf/Hoare-z3.931718.smt b/test/regress/regress1/fmf/Hoare-z3.931718.smt index 754e19e88..4ab90756c 100644 --- a/test/regress/regress1/fmf/Hoare-z3.931718.smt +++ b/test/regress/regress1/fmf/Hoare-z3.931718.smt @@ -1,5 +1,5 @@ -% COMMAND-LINE: --finite-model-find -% EXPECT: sat +; COMMAND-LINE: --finite-model-find +; EXPECT: sat (benchmark Isabelle :status sat :logic AUFLIA diff --git a/test/regress/run_regression b/test/regress/run_regression deleted file mode 100755 index 4ae013911..000000000 --- a/test/regress/run_regression +++ /dev/null @@ -1,424 +0,0 @@ -#!/bin/bash -# -# Morgan Deters -# for the CVC4 project -# -# usage: -# -# run_regression [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ] -# -# Runs benchmark and checks for correct exit status and output. -# -# The TEST_GROUP and TEST_GROUPS environment variables can be used to split the -# benchmarks into groups and only run one of the groups. The benchmarks are -# associated with a group based on a hash of their name. TEST_GROUPS controls -# how many groups are created while TEST_GROUP specifies the group to run. -# Currently, the primary use case for this is to split the benchmarks across -# multiple Travis jobs. - -# ulimit -t 1 # For detecting long running regressions -ulimit -s 65000 # Needed for some (esp. 
portfolio and model-building) - -prog=`basename "$0"` - -if [ $# -lt 2 ]; then - echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2 - exit 1 -fi - -proof=no -dump=no -if [ x"$1" = x--proof ]; then - proof=yes - shift -elif [ x"$1" = x--dump ]; then - dump=yes - shift -fi - -if [ $# -lt 2 ]; then - echo "usage: $prog [ --proof | --dump ] [ wrapper ] cvc4-binary [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ]" >&2 - exit 1 -fi - -wrapper= -while [ $# -gt 2 ]; do - wrapper="$wrapper$1 " - shift -done - -[[ "$VALGRIND" = "1" ]] && { - wrapper="libtool --mode=execute valgrind $wrapper" -} - -cvc4=$1 -benchmark_orig=$2 -benchmark="$benchmark_orig" - -if [ -n "$TEST_GROUP" ]; then - # Use the checksum of the benchmark name to determine the group of the - # benchmark - benchmark_group=$(( $(echo "$benchmark" | cksum | cut -d " " -f1) % $TEST_GROUPS )) - if (( $benchmark_group != $TEST_GROUP )); then - # Skip the benchmark if it is not in the expected test group - exit 77 - fi -fi - -function error { - echo "$prog: error: $*" - exit 1 -} - -if ! [ -x "$cvc4" ]; then - error "\`$cvc4' doesn't exist or isn't executable" >&2 -fi -if ! [ -r "$benchmark" ]; then - error "\`$benchmark' doesn't exist or isn't readable" >&2 -fi - -# gettemp() and its associated tempfiles[] array are intended to never -# allow a temporary file to leak---the trap ensures that when this script -# exits, whether via a regular exit or an -INT or other signal, the -# temp files are deleted. -declare -a tempfiles -trap -- 'test ${#tempfiles[@]} -gt 0 && rm -f "${tempfiles[@]}"' EXIT -function gettemp { - local temp="`mktemp -t "$2"`" - tempfiles[${#tempfiles[@]}]="$temp" - eval "$1"="$temp" -} - -tmpbenchmark= -if expr "$benchmark" : '.*\.smt$' &>/dev/null; then - lang=smt1 - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - elif grep '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` - # old mktemp from coreutils 7.x is broken, can't do XXXX in the middle - # this frustrates our auto-language-detection - gettemp tmpbenchmark cvc4_benchmark.smt.$$.XXXXXXXXXX - grep -v '^% \(PROOF\|EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" >"$tmpbenchmark" - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - 
benchmark=$tmpbenchmark - elif grep '^ *:status *sat' "$benchmark" &>/dev/null; then - expected_output=sat - expected_exit_status=0 - command_line= - elif grep '^ *:status *unsat' "$benchmark" &>/dev/null; then - expected_output=unsat - expected_exit_status=0 - command_line= - else - error "cannot determine status of \`$benchmark'" - fi -elif expr "$benchmark" : '.*\.smt2$' &>/dev/null; then - lang=smt2 - - # If this test case requires unsat cores but CVC4 is not built with proof - # support, skip it. Note: checking $CVC4_REGRESSION_ARGS instead of $proof - # here because $proof is not set for the second run. - requires_proof=`grep '(get-unsat-core)\|(get-unsat-assumptions)' "$benchmark"` - if [[ ! "$CVC4_REGRESSION_ARGS" = *"--proof"* ]] && [ -n "$requires_proof" ]; then - exit 77 - fi - - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'` - errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'` - expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'` - expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'` - command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'` - fi - - # If expected output/error was not given, try to determine the status from - # the commands. - if [ -z "$expected_output" -a -z "$expected_error" ]; then - if grep '^ *( *set-info *:status *sat' "$benchmark" &>/dev/null; then - expected_output=sat - elif grep '^ *( *set-info *:status *unsat' "$benchmark" &>/dev/null; then - expected_output=unsat - fi - fi - - # A valid test case needs at least an expected output (explicit or through - # SMT2 commands) or an expected exit status. - if [ -z "$expected_output" -a -z "$expected_error" -a -z "$expected_exit_status" ]; then - error "cannot determine status of \`$benchmark'" - fi - - # If no explicit expected exit status was given, we assume that the solver is - # supposed to succeed. 
- if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi -elif expr "$benchmark" : '.*\.cvc$' &>/dev/null; then - lang=cvc4 - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=$(grep '^% EXPECT: ' "$benchmark") - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - if [ -z "$expected_output" -a -z "$expected_error" ]; then - error "cannot determine expected output of \`$benchmark': " \ - "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures" - fi - expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,') - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` -elif expr "$benchmark" : '.*\.p$' &>/dev/null; then - lang=tptp - command_line=--finite-model-find - scrubber=`grep '^% SCRUBBER: ' "$benchmark" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=$(grep '^% EXPECT: ' "$benchmark") - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark" | sed 's,^% EXPECT-ERROR: ,,'` - if [ -z "$expected_output" -a -z "$expected_error" ]; then - if grep -q '^% Status *: ' "$benchmark"; then - expected_output="$(grep '^% *Status *: ' "$benchmark" | head -1 | awk '{print$NF}')" - case "$expected_output" in - Theorem|Unsatisfiable) expected_exit_status=0 ;; - CounterSatisfiable|Satisfiable) expected_exit_status=0 ;; - GaveUp) expected_exit_status=0 ;; - esac - expected_output="% SZS status $expected_output for $(basename "$benchmark" | sed 's,\.p$,,')" - else - error "cannot determine expected output of \`$benchmark': " \ - "please use \`% EXPECT:' and/or \`% EXPECT-ERROR:' gestures" - fi - else - expected_output=$(echo "$expected_output" | perl -pe 's,^% EXPECT: ,,;s,\r,,') - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - fi - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - if grep -q '^% COMMAND-LINE: ' "$benchmark"; then - command_line=`grep '^% COMMAND-LINE: ' "$benchmark" | sed 's,^% COMMAND-LINE: ,,'` - fi -elif expr "$benchmark" : '.*\.sy$' &>/dev/null; then - lang=sygus - if test -e "$benchmark.expect"; then - scrubber=`grep '^% SCRUBBER: ' "$benchmark.expect" | sed 's,^% SCRUBBER: ,,'` - errscrubber=`grep '^% ERROR-SCRUBBER: ' "$benchmark.expect" | sed 's,^% ERROR-SCRUBBER: ,,'` - expected_output=`grep '^% EXPECT: ' "$benchmark.expect" | sed 's,^% EXPECT: ,,'` - expected_error=`grep '^% EXPECT-ERROR: ' "$benchmark.expect" | sed 's,^% EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^% EXIT: ' "$benchmark.expect" | perl -pe 's,^% EXIT: ,,;s,\r,,'` - command_line=`grep '^% COMMAND-LINE: ' "$benchmark.expect" | sed 's,^% COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - elif grep '^; \(EXPECT\|EXPECT-ERROR\|EXIT\|COMMAND-LINE\|SCRUBBER\|ERROR-SCRUBBER\): ' "$benchmark" "$benchmark" &>/dev/null; then - scrubber=`grep '^; SCRUBBER: ' "$benchmark" | sed 's,^; SCRUBBER: ,,'` - errscrubber=`grep '^; ERROR-SCRUBBER: ' "$benchmark" | sed 's,^; ERROR-SCRUBBER: ,,'` - expected_output=`grep '^; EXPECT: ' "$benchmark" | sed 's,^; EXPECT: ,,'` - expected_error=`grep '^; EXPECT-ERROR: ' "$benchmark" | sed 's,^; 
EXPECT-ERROR: ,,'` - expected_exit_status=`grep -m 1 '^; EXIT: ' "$benchmark" | perl -pe 's,^; EXIT: ,,;s,\r,,'` - command_line=`grep '^; COMMAND-LINE: ' "$benchmark" | sed 's,^; COMMAND-LINE: ,,'` - if [ -z "$expected_exit_status" ]; then - expected_exit_status=0 - fi - else - error "cannot determine expected output of \`$benchmark'" - fi -else - error "benchmark \`$benchmark' must be *.cvc or *.smt or *.smt2 or *.p or *.sy" -fi - -command_line="--lang=$lang ${command_line:+$command_line }" - -gettemp expoutfile cvc4_expect_stdout.$$.XXXXXXXXXX -gettemp experrfile cvc4_expect_stderr.$$.XXXXXXXXXX -gettemp outfile cvc4_stdout.$$.XXXXXXXXXX -gettemp errfile cvc4_stderr.$$.XXXXXXXXXX -gettemp errfilefix cvc4_stderr.$$.XXXXXXXXXX -gettemp exitstatusfile cvc4_exitstatus.$$.XXXXXXXXXX - -if [ -z "$expected_output" ]; then - # in case expected stdout output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$expoutfile" - touch "$expoutfile" -else - echo "$expected_output" >"$expoutfile" -fi - -if [ "$lang" = sygus ]; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-synth-sol' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-synth-sol' &>/dev/null; then - # later on, we'll run another test with --check-models on - command_line="$command_line --check-synth-sol" - fi -fi -check_models=false -if grep '^sat$' "$expoutfile" &>/dev/null || grep '^invalid$' "$expoutfile" &>/dev/null || grep '^unknown$' "$expoptfile" &>/dev/null; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-models' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-models' &>/dev/null; then - # later on, we'll run another test with --check-models on - check_models=true - fi -fi -check_proofs=false -check_unsat_cores=false -if [ "$proof" = yes ]; then - # proofs not currently supported in incremental mode, turn it off - if grep '^unsat$' "$expoutfile" &>/dev/null || grep '^valid$' "$expoutfile" &>/dev/null &>/dev/null; then - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-proofs' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-proofs' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null && - ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null && - ! expr "$BINARY" : '.*pcvc4' &>/dev/null && - ! expr "$benchmark" : '.*\.sy$' &>/dev/null; then - # later on, we'll run another test with --check-proofs on - check_proofs=true - fi - if ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--check-unsat-cores' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--no-check-unsat-cores' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--incremental' &>/dev/null && - ! expr "$CVC4_REGRESSION_ARGS $command_line" : '.*--unconstrained-simp' &>/dev/null && - ! expr " $CVC4_REGRESSION_ARGS $command_line" : '.* -[a-zA-Z]*i' &>/dev/null && - ! expr "$BINARY" : '.*pcvc4' &>/dev/null && - ! 
expr "$benchmark" : '.*\.sy$' &>/dev/null; then - # later on, we'll run another test with --check-unsat-cores on - check_unsat_cores=true - fi - fi -fi - -if [ -z "$expected_error" ]; then - # in case expected stderr output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$experrfile" - touch "$experrfile" -else - echo "$expected_error" >"$experrfile" -fi - -cvc4dir=`dirname "$cvc4"` -cvc4dirfull=`cd "$cvc4dir" && pwd` -if [ -z "$cvc4dirfull" ]; then - error "getting directory of \`$cvc4 !?" -fi -cvc4base=`basename "$cvc4"` -cvc4full="$cvc4dirfull/$cvc4base" -if [ $dump = no ]; then - echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"` [from working dir `dirname "$benchmark"`] - time ( :; \ - ( cd `dirname "$benchmark"`; - $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line `basename "$benchmark"`; - echo $? >"$exitstatusfile" - ) > "$outfile" 2> "$errfile" ) -else - echo running $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` \| $wrapper $cvc4full $CVC4_REGRESSION_ARGS $command_line --lang=smt2 - [from working dir `dirname "$benchmark"`] - time ( :; \ - ( cd `dirname "$benchmark"`; - $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --preprocess-only --dump=clauses --output-lang=smt2 -q `basename "$benchmark"` | $wrapper "$cvc4full" $CVC4_REGRESSION_ARGS $command_line --lang=smt2 -; - echo $? >"$exitstatusfile" - ) > "$outfile" 2> "$errfile" ) -fi - -# we have to actual error file same treatment as other files. differences in -# versions of echo/bash were causing failure on some platforms and not on others -# (also grep out valgrind output, if 0 errors reported by valgrind) -actual_error=$(cat $errfile) -if [[ "$VALGRIND" = "1" ]]; then - #valgrind_output=$(cat $errfile|grep -E "^==[0-9]+== "|) - valgrind_num_errors=$(cat $errfile|grep -E "^==[0-9]+== "|tail -n1|awk '{print $4}') - echo "valgrind errors (not suppressed): $valgrind_num_errors" 1>&2 - - ((valgrind_num_errors == 0)) && actual_error=$(echo "$actual_error"|grep -vE "^==[0-9]+== ") -fi -if [ -z "$actual_error" ]; then - # in case expected stderr output is empty, make sure we don't differ - # by a newline, which we would if we echo "" >"$experrfile" - touch "$errfilefix" -else - echo "$actual_error" >"$errfilefix" -fi - -# Scrub the output file if a scrubber has been specified. -if [ -n "$scrubber" ]; then - echo "About to scrub $outfile using $scrubber" - mv "$outfile" "$outfile.prescrub" - cat "$outfile.prescrub" | eval $scrubber >"$outfile" -else - echo "not scrubbing" -fi - -# Scrub the error file if a scrubber has been specified. -if [ -n "$errscrubber" ]; then - echo "About to scrub $errfilefix using $errscrubber" - mv "$errfilefix" "$errfilefix.prescrub" - cat "$errfilefix.prescrub" | eval $errscrubber >"$errfilefix" -else - echo "not scrubbing error file" -fi - -diffs=`diff -u --strip-trailing-cr "$expoutfile" "$outfile"` -diffexit=$? -diffserr=`diff -u --strip-trailing-cr "$experrfile" "$errfilefix"` -diffexiterr=$? 
-exit_status=`cat "$exitstatusfile"` - -exitcode=0 -if [ $diffexit -ne 0 ]; then - echo "$prog: error: differences between expected and actual output on stdout" - echo "$diffs" - exitcode=1 -fi -if [ $diffexiterr -ne 0 ]; then - echo "$prog: error: differences between expected and actual output on stderr" - echo "$diffserr" - exitcode=1 -fi - -if [ "$exit_status" != "$expected_exit_status" ]; then - echo "$prog: error: expected exit status \`$expected_exit_status' but got \`$exit_status'" - exitcode=1 -fi - -if $check_models || $check_proofs || $check_unsat_cores; then - check_cmdline= - if $check_models; then - check_cmdline="$check_cmdline --check-models" - fi - if $check_proofs; then - # taking: Make the extra flags part of --check-proofs. - check_cmdline="$check_cmdline --check-proofs --no-bv-eq --no-bv-ineq --no-bv-algebraic" - fi - if $check_unsat_cores; then - check_cmdline="$check_cmdline --check-unsat-cores" - fi - # run an extra model/proof/core-checking pass - if ! CVC4_REGRESSION_ARGS="$CVC4_REGRESSION_ARGS$check_cmdline" "$0" $wrapper "$cvc4" "$benchmark_orig"; then - exitcode=1 - fi -fi - - -exit $exitcode diff --git a/test/regress/run_regression.py b/test/regress/run_regression.py new file mode 100755 index 000000000..c861b0d71 --- /dev/null +++ b/test/regress/run_regression.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +""" +Usage: + + run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary + [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p ] + +Runs benchmark and checks for correct exit status and output. +""" + +import argparse +import difflib +import os +import re +import shlex +import subprocess +import sys + +SCRUBBER = 'SCRUBBER: ' +ERROR_SCRUBBER = 'ERROR-SCRUBBER: ' +EXPECT = 'EXPECT: ' +EXPECT_ERROR = 'EXPECT-ERROR: ' +EXIT = 'EXIT: ' +COMMAND_LINE = 'COMMAND-LINE: ' + + +def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary, + command_line, benchmark_dir, benchmark_filename): + """Runs CVC4 on the file `benchmark_filename` in the directory + `benchmark_dir` using the binary `cvc4_binary` with the command line + options `command_line`. The output is scrubbed using `scrubber` and + `error_scrubber` for stdout and stderr, respectively. If dump is true, the + function first uses CVC4 to read in and dump the benchmark file and then + uses that as input.""" + + bin_args = wrapper[:] + bin_args.append(cvc4_binary) + + output = None + error = None + exit_status = None + if dump: + dump_args = [ + '--preprocess-only', '--dump', 'raw-benchmark', + '--output-lang=smt2', '-qq' + ] + dump_process = subprocess.Popen( + bin_args + command_line + dump_args + [benchmark_filename], + cwd=benchmark_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + dump_output, _ = dump_process.communicate() + process = subprocess.Popen( + bin_args + command_line + ['--lang=smt2', '-'], + cwd=benchmark_dir, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = process.communicate(input=dump_output) + exit_status = process.returncode + else: + process = subprocess.Popen( + bin_args + command_line + [benchmark_filename], + cwd=benchmark_dir, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, error = process.communicate() + exit_status = process.returncode + + # If a scrubber command has been specified then apply it to the output. 
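+    # (Scrubbers come from the benchmark metadata and normalize output that
+    # varies between runs before it is compared, e.g. scrub.02.smt above
+    # declares "SCRUBBER: sed -e 's/The fact in question: .*$/The fact in
+    # question: TERM/'".)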
+ if scrubber: + scrubber_process = subprocess.Popen( + shlex.split(scrubber), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + output, _ = scrubber_process.communicate(input=output) + if error_scrubber: + error_scrubber_process = subprocess.Popen( + shlex.split(error_scrubber), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + error, _ = error_scrubber_process.communicate(input=error) + + # Popen in Python 3 returns a bytes object instead of a string for + # stdout/stderr. + if isinstance(output, bytes): + output = output.decode() + if isinstance(error, bytes): + error = error.decode() + return (output.strip(), error.strip(), exit_status) + + +def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path): + """Determines the expected output for a benchmark, runs CVC4 on it and then + checks whether the output corresponds to the expected output. Optionally + uses a wrapper `wrapper`, tests proof generation (if proof is true), or + dumps a benchmark and uses that as the input (if dump is true).""" + + if not os.access(cvc4_binary, os.X_OK): + sys.exit( + '"{}" does not exist or is not executable'.format(cvc4_binary)) + if not os.path.isfile(benchmark_path): + sys.exit('"{}" does not exist or is not a file'.format(benchmark_path)) + + basic_command_line_args = [] + + benchmark_basename = os.path.basename(benchmark_path) + benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename) + benchmark_dir = os.path.dirname(benchmark_path) + comment_char = '%' + status_regex = None + status_to_output = lambda s: s + if benchmark_ext == '.smt': + status_regex = r':status\s*(sat|unsat)' + comment_char = ';' + elif benchmark_ext == '.smt2': + status_regex = r'set-info\s*:status\s*(sat|unsat)' + comment_char = ';' + elif benchmark_ext == '.cvc': + pass + elif benchmark_ext == '.p': + basic_command_line_args.append('--finite-model-find') + status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)' + status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename) + elif benchmark_ext == '.sy': + comment_char = ';' + # Do not use proofs/unsat-cores with .sy files + proof = False + else: + sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format( + benchmark_basename)) + + # If there is an ".expect" file for the benchmark, read the metadata + # from there, otherwise from the benchmark file. + metadata_filename = benchmark_path + '.expect' + if os.path.isfile(metadata_filename): + comment_char = '%' + else: + metadata_filename = benchmark_path + + metadata_lines = None + with open(metadata_filename, 'r') as metadata_file: + metadata_lines = metadata_file.readlines() + metadata_content = ''.join(metadata_lines) + + # Extract the metadata for the benchmark. 
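+    # Metadata directives are comment lines in the benchmark (or in its
+    # ".expect" file), for example:
+    #
+    #   ; EXPECT: unsat
+    #   ; COMMAND-LINE: --finite-model-find
+    #   ; EXIT: 1
+    #
+    # The comment character depends on the input language (see above).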
+    scrubber = None
+    error_scrubber = None
+    expected_output = ''
+    expected_error = ''
+    expected_exit_status = None
+    command_line = ''
+    for line in metadata_lines:
+        # Skip lines that do not start with the comment character.
+        if line[0] != comment_char:
+            continue
+        # Strip the comment character and the space that follows it.
+        line = line[2:]
+
+        if line.startswith(SCRUBBER):
+            scrubber = line[len(SCRUBBER):]
+        elif line.startswith(ERROR_SCRUBBER):
+            error_scrubber = line[len(ERROR_SCRUBBER):]
+        elif line.startswith(EXPECT):
+            expected_output += line[len(EXPECT):]
+        elif line.startswith(EXPECT_ERROR):
+            expected_error += line[len(EXPECT_ERROR):]
+        elif line.startswith(EXIT):
+            expected_exit_status = int(line[len(EXIT):])
+        elif line.startswith(COMMAND_LINE):
+            command_line += line[len(COMMAND_LINE):]
+    expected_output = expected_output.strip()
+    expected_error = expected_error.strip()
+
+    # Expected output/expected error has not been defined in the metadata for
+    # the benchmark. Try to extract the information from the benchmark itself.
+    if expected_output == '' and expected_error == '':
+        match = None
+        if status_regex:
+            match = re.search(status_regex, metadata_content)
+
+        if match:
+            expected_output = status_to_output(match.group(1))
+        elif expected_exit_status is None:
+            # If there is no expected output/error and the exit status has not
+            # been set explicitly, the benchmark is invalid.
+            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
+
+    if not proof and ('(get-unsat-cores)' in metadata_content
+                      or '(get-unsat-assumptions)' in metadata_content):
+        print(
+            '1..0 # Skipped: unsat cores not supported without proof support')
+        return
+
+    if expected_exit_status is None:
+        expected_exit_status = 0
+
+    if 'CVC4_REGRESSION_ARGS' in os.environ:
+        basic_command_line_args += shlex.split(
+            os.environ['CVC4_REGRESSION_ARGS'])
+    basic_command_line_args += shlex.split(command_line)
+    command_line_args_configs = [basic_command_line_args]
+
+    extra_command_line_args = []
+    if benchmark_ext == '.sy' and \
+       '--no-check-synth-sol' not in basic_command_line_args and \
+       '--check-synth-sol' not in basic_command_line_args:
+        extra_command_line_args = ['--check-synth-sol']
+    if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
+       '--no-check-models' not in basic_command_line_args:
+        extra_command_line_args = ['--check-models']
+    # Proof/unsat-core checking only makes sense for benchmarks that are
+    # expected to be unsat/valid.
+    if proof and re.search(r'^(unsat|valid)$', expected_output):
+        if '--no-check-proofs' not in basic_command_line_args and \
+           '--incremental' not in basic_command_line_args and \
+           '--unconstrained-simp' not in basic_command_line_args and \
+           not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args = [
+                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
+                '--no-bv-algebraic'
+            ]
+        if '--no-check-unsat-cores' not in basic_command_line_args and \
+           '--incremental' not in basic_command_line_args and \
+           '--unconstrained-simp' not in basic_command_line_args and \
+           not cvc4_binary.endswith('pcvc4'):
+            extra_command_line_args += ['--check-unsat-cores']
+    if extra_command_line_args:
+        command_line_args_configs.append(
+            basic_command_line_args + extra_command_line_args)
+
+    # Run CVC4 on the benchmark with the different option sets and check
+    # whether the exit status, stdout output, and stderr output are as
+    # expected.
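+    # For example, for a "sat" benchmark that gets an extra --check-models
+    # run, the TAP stream emitted below looks like:
+    #
+    #   1..2
+    #   # Starting
+    #   ok - Flags: []
+    #   ok - Flags: ['--check-models']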
+ print('1..{}'.format(len(command_line_args_configs))) + print('# Starting') + for command_line_args in command_line_args_configs: + output, error, exit_status = run_benchmark( + dump, wrapper, scrubber, error_scrubber, cvc4_binary, + command_line_args, benchmark_dir, benchmark_basename) + if output != expected_output: + print( + 'not ok - Differences between expected and actual output on stdout - Flags: {}'. + format(command_line_args)) + for line in difflib.context_diff(output.splitlines(), + expected_output.splitlines()): + print(line) + elif error != expected_error: + print( + 'not ok - Differences between expected and actual output on stderr - Flags: {}'. + format(command_line_args)) + for line in difflib.context_diff(error.splitlines(), + expected_error.splitlines()): + print(line) + elif expected_exit_status != exit_status: + print( + 'not ok - Expected exit status "{}" but got "{}" - Flags: {}'. + format(expected_exit_status, exit_status, command_line_args)) + else: + print('ok - Flags: {}'.format(command_line_args)) + + +def main(): + """Parses the command line arguments and then calls the core of the + script.""" + + parser = argparse.ArgumentParser( + description= + 'Runs benchmark and checks for correct exit status and output.') + parser.add_argument('--proof', action='store_true') + parser.add_argument('--dump', action='store_true') + parser.add_argument('wrapper', nargs='*') + parser.add_argument('cvc4_binary') + parser.add_argument('benchmark') + args = parser.parse_args() + cvc4_binary = os.path.abspath(args.cvc4_binary) + + wrapper = args.wrapper + if os.environ.get('VALGRIND') == '1' and not wrapper: + wrapper = ['libtool', '--mode=execute', 'valgrind'] + + run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark) + + +if __name__ == "__main__": + main() -- 2.30.2
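For reference, the new script can also be invoked by hand to reproduce a
failure outside the automake harness. A minimal sketch (the path to the
cvc4 binary is a placeholder for whatever the local build produced):

    $ ./test/regress/run_regression.py ./builddir/src/main/cvc4 \
          test/regress/regress0/arrays/incorrect10.smt
    1..1
    # Starting
    ok - Flags: ['--no-check-proofs']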