[Reg] Make status/unsat-core detection more robust (#1775)
[cvc5.git] / test/regress/run_regression.py
#!/usr/bin/env python3
"""
Usage:

    run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
        [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p | benchmark.sy ]

Runs a benchmark and checks for the correct exit status and output.
"""

import argparse
import difflib
import os
import re
import shlex
import subprocess
import sys

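# Directives that may appear in benchmark comments (or in a companion
# ".expect" file) to describe the expected output, exit status, scrubber
# commands, and extra command line options.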
SCRUBBER = 'SCRUBBER: '
ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
EXPECT = 'EXPECT: '
EXPECT_ERROR = 'EXPECT-ERROR: '
EXIT = 'EXIT: '
COMMAND_LINE = 'COMMAND-LINE: '


def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
    function first uses CVC4 to read in and dump the benchmark file and then
    uses that as input."""

    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
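    # In dump mode, CVC4 first parses and re-prints the benchmark as SMT-LIB2
    # (the "raw-benchmark" dump), and that dump is then fed back in on stdin.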
    if dump:
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_process = subprocess.Popen(
            bin_args + command_line + dump_args + [benchmark_filename],
            cwd=benchmark_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        dump_output, _ = dump_process.communicate()
        process = subprocess.Popen(
            bin_args + command_line + ['--lang=smt2', '-'],
            cwd=benchmark_dir,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        output, error = process.communicate(input=dump_output)
        exit_status = process.returncode
    else:
        process = subprocess.Popen(
            bin_args + command_line + [benchmark_filename],
            cwd=benchmark_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        output, error = process.communicate()
        exit_status = process.returncode

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        scrubber_process = subprocess.Popen(
            shlex.split(scrubber),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        output, _ = scrubber_process.communicate(input=output)
    if error_scrubber:
        error_scrubber_process = subprocess.Popen(
            shlex.split(error_scrubber),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        error, _ = error_scrubber_process.communicate(input=error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)


def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
    dumps a benchmark and uses that as the input (if dump is true)."""

    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    status_to_output = lambda s: s
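    # The comment character and the status annotation to search for depend on
    # the input language, which is derived from the file extension.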
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        basic_command_line_args.append('--finite-model-find')
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(
            s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        proof = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    # If there is an ".expect" file for the benchmark, read the metadata
    # from there, otherwise from the benchmark file.
    metadata_filename = benchmark_path + '.expect'
    if os.path.isfile(metadata_filename):
        comment_char = '%'
    else:
        metadata_filename = benchmark_path

    metadata_lines = None
    with open(metadata_filename, 'r') as metadata_file:
        metadata_lines = metadata_file.readlines()

    benchmark_content = None
    if metadata_filename == benchmark_path:
        benchmark_content = ''.join(metadata_lines)
    else:
        with open(benchmark_path, 'r') as benchmark_file:
            benchmark_content = benchmark_file.read()

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_line = ''
    for line in metadata_lines:
        # Skip lines that do not start with the comment character.
        if line[0] != comment_char:
            continue
        # Strip the comment character and the space that follows it.
        line = line[2:]

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):]
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):]
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):]
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):]
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):])
        elif line.startswith(COMMAND_LINE):
            command_line += line[len(COMMAND_LINE):]
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # If the expected output/error has not been defined in the metadata for
    # the benchmark, try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.search(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output(match.group(1))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))

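    # Benchmarks that ask for unsat cores or unsat assumptions are only run
    # when proof support is enabled.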
    if not proof and ('(get-unsat-core)' in benchmark_content
                      or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped: unsat cores not supported without proof support')
        return

    if expected_exit_status is None:
        expected_exit_status = 0

    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])
    basic_command_line_args += shlex.split(command_line)
    command_line_args_configs = [basic_command_line_args]

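    # Where applicable, add a second configuration that also cross-checks the
    # result (synthesized solutions, models, proofs, or unsat cores).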
    extra_command_line_args = []
    if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in basic_command_line_args and \
            '--check-synth-sol' not in basic_command_line_args:
        extra_command_line_args = ['--check-synth-sol']
    if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
            '--no-check-models' not in basic_command_line_args:
        extra_command_line_args = ['--check-models']
    if proof and re.search(r'^(unsat|valid)$', expected_output):
        if '--no-check-proofs' not in basic_command_line_args and \
                '--incremental' not in basic_command_line_args and \
                '--unconstrained-simp' not in basic_command_line_args and \
                not cvc4_binary.endswith('pcvc4'):
            extra_command_line_args = [
                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
                '--no-bv-algebraic'
            ]
        if '--no-check-unsat-cores' not in basic_command_line_args and \
                '--incremental' not in basic_command_line_args and \
                '--unconstrained-simp' not in basic_command_line_args and \
                not cvc4_binary.endswith('pcvc4'):
            extra_command_line_args += ['--check-unsat-cores']
    if extra_command_line_args:
        command_line_args_configs.append(
            basic_command_line_args + extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status and the output on stdout and stderr are as
    # expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(
            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
            command_line_args, benchmark_dir, benchmark_basename)
        if output != expected_output:
            print(
                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(output.splitlines(),
                                             expected_output.splitlines()):
                print(line)
        elif error != expected_error:
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(error.splitlines(),
                                             expected_error.splitlines()):
                print(line)
        elif expected_exit_status != exit_status:
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
        else:
            print('ok - Flags: {}'.format(command_line_args))


def main():
    """Parses the command line arguments and then calls the core of the
    script."""

    parser = argparse.ArgumentParser(
        description=
        'Runs a benchmark and checks for the correct exit status and output.')
    parser.add_argument('--proof', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
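    # When VALGRIND=1 is set and no wrapper was given, run the binary under
    # valgrind through libtool.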
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark)


if __name__ == "__main__":
    main()