Enforcing --no-bv-eq, --no-bv-algebraic and --no-bv-ineq when proofs are enabled...
cvc5.git: test/regress/run_regression.py
#!/usr/bin/env python3
"""
Usage:

run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
    [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p | benchmark.sy ]

Runs a benchmark and checks for correct exit status and output.
"""

import argparse
import difflib
import os
import re
import shlex
import subprocess
import sys
import threading

SCRUBBER = 'SCRUBBER: '
ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
EXPECT = 'EXPECT: '
EXPECT_ERROR = 'EXPECT-ERROR: '
EXIT = 'EXIT: '
COMMAND_LINE = 'COMMAND-LINE: '

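# Illustration (hypothetical benchmark header, not part of the original
# script): these directives are read from comment lines of a benchmark or of
# its ".expect" file. In an .smt2 benchmark, where the comment character is
# ';', a header could look like:
#
#   ; COMMAND-LINE: --incremental
#   ; EXPECT: sat
#   ; EXIT: 0
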

def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""

    proc = subprocess.Popen(
        args,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = 124
    timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
    try:
        timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        timer.cancel()

    return out, err, exit_status

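# Illustration (hypothetical call, not part of the original script): run a
# command with a 5-second timeout. The pipes are opened in binary mode, so
# `out` and `err` are bytes on Python 3:
#
#   out, err, status = run_process(['echo', 'hello'], '.', 5)
#   assert status == 0 and out == b'hello\n'
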

def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If `dump` is true,
    the function first uses CVC4 to read in and dump the benchmark file and
    then uses that as input."""

    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        output, _, _ = run_process(
            shlex.split(scrubber), benchmark_dir, timeout, output)
    if error_scrubber:
        error, _, _ = run_process(
            shlex.split(error_scrubber), benchmark_dir, timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)

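# Illustration (hypothetical paths and flags, not part of the original
# script): run a benchmark directly (no dump, no wrapper) and scrub memory
# addresses from stdout before comparing it against the expected output:
#
#   out, err, status = run_benchmark(
#       False, [], 'sed s/0x[0-9a-f]*/0xADDR/g', None, './bin/cvc4',
#       ['--stats'], '.', 'test.smt2', 600)
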

def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path, timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
    dumps a benchmark and uses that as the input (if dump is true)."""

    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        basic_command_line_args.append('--finite-model-find')
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        proof = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))
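
    # Illustration (hypothetical benchmark, not part of the original
    # script): a TPTP file "foo.p" whose header contains the line
    #     % Status : Theorem
    # yields the expected output "% SZS status Theorem for foo".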

    # If there is an ".expect" file for the benchmark, read the metadata
    # from there, otherwise from the benchmark file.
    metadata_filename = benchmark_path + '.expect'
    if os.path.isfile(metadata_filename):
        comment_char = '%'
    else:
        metadata_filename = benchmark_path

    metadata_lines = None
    with open(metadata_filename, 'r') as metadata_file:
        metadata_lines = metadata_file.readlines()

    benchmark_content = None
    if metadata_filename == benchmark_path:
        benchmark_content = ''.join(metadata_lines)
    else:
        with open(benchmark_path, 'r') as benchmark_file:
            benchmark_content = benchmark_file.read()

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_line = ''
    for line in metadata_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):]
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):]
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):]
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):]
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):])
        elif line.startswith(COMMAND_LINE):
            command_line += line[len(COMMAND_LINE):]
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # If the expected output/error has not been defined in the metadata for
    # the benchmark, try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.search(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output(match.group(1))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0

    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])
    basic_command_line_args += shlex.split(command_line)
    command_line_args_configs = [basic_command_line_args]
    if not proof and ('(get-unsat-core)' in benchmark_content
                      or '(get-unsat-assumptions)' in benchmark_content
                      or '--check-proofs' in basic_command_line_args
                      or '--dump-proofs' in basic_command_line_args):
        print(
            '1..0 # Skipped: unsat cores not supported without proof support')
        return

    extra_command_line_args = []
    if benchmark_ext == '.sy' and \
        '--no-check-synth-sol' not in basic_command_line_args and \
        '--check-synth-sol' not in basic_command_line_args:
        extra_command_line_args = ['--check-synth-sol']
    if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
       '--no-check-models' not in basic_command_line_args:
        extra_command_line_args = ['--check-models']
    if proof and re.search(r'^(unsat|valid)$', expected_output):
        if '--no-check-proofs' not in basic_command_line_args and \
           '--incremental' not in basic_command_line_args and \
           '--unconstrained-simp' not in basic_command_line_args and \
           not cvc4_binary.endswith('pcvc4'):
            extra_command_line_args = [
                '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
                '--no-bv-algebraic'
            ]
        if '--no-check-unsat-cores' not in basic_command_line_args and \
           '--incremental' not in basic_command_line_args and \
           '--unconstrained-simp' not in basic_command_line_args and \
           not cvc4_binary.endswith('pcvc4'):
            extra_command_line_args += ['--check-unsat-cores']
    if extra_command_line_args:
        command_line_args_configs.append(
            basic_command_line_args + extra_command_line_args)
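
    # Illustration (hypothetical, not part of the original script): for a
    # benchmark whose expected output is "sat" and whose metadata specifies
    # no extra flags, the script tries two configurations:
    #   command_line_args_configs == [[], ['--check-models']]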

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, and stderr output are as
    # expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(
            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
            command_line_args, benchmark_dir, benchmark_basename, timeout)
        if output != expected_output:
            print(
                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(output.splitlines(),
                                             expected_output.splitlines()):
                print(line)
        elif error != expected_error:
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(error.splitlines(),
                                             expected_error.splitlines()):
                print(line)
        elif expected_exit_status != exit_status:
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
        else:
            print('ok - Flags: {}'.format(command_line_args))


def main():
    """Parses the command line arguments and then calls the core of the
    script."""

    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    parser.add_argument('--proof', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark,
                   timeout)

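# Example invocation (hypothetical paths, not part of the original script);
# TEST_TIMEOUT and VALGRIND are read from the environment:
#
#   TEST_TIMEOUT=60 ./run_regression.py --proof ./bin/cvc4 regress0/simple.smt2
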

if __name__ == "__main__":
    main()