Support multiple sets of command line args in regs (#1902)
test/regress/run_regression.py
#!/usr/bin/env python3
"""
Usage:

    run_regression.py [ --proof | --dump ] [ wrapper ] cvc4-binary
        [ benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p | benchmark.sy ]

Runs a benchmark and checks for the correct exit status and output.
"""

import argparse
import difflib
import os
import re
import shlex
import subprocess
import sys
import threading

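# Directives that a benchmark (or its accompanying ".expect" file) can
# specify in comment lines. For example, an SMT-LIB v2 benchmark might start
# with the following (the options are purely illustrative):
#
#   ; COMMAND-LINE: --incremental
#   ; COMMAND-LINE: --decision=justification
#   ; EXPECT: sat
#
# Each COMMAND-LINE directive specifies one set of options that the
# benchmark is run with.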
SCRUBBER = 'SCRUBBER: '
ERROR_SCRUBBER = 'ERROR-SCRUBBER: '
EXPECT = 'EXPECT: '
EXPECT_ERROR = 'EXPECT-ERROR: '
EXIT = 'EXIT: '
COMMAND_LINE = 'COMMAND-LINE: '


def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""

    proc = subprocess.Popen(
        args,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = 124
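    # Watchdog: kill the process if it does not terminate within `timeout`
    # seconds. The default exit status of 124 follows the convention of the
    # GNU timeout(1) utility.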
    timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
    try:
        timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        timer.cancel()

    return out, err, exit_status


def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If `dump` is true,
    the function first uses CVC4 to read in and dump the benchmark file and
    then uses that as input."""

    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
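        # First use CVC4 to read in and dump the benchmark in SMT-LIB v2
        # format, then run CVC4 on the dumped benchmark, fed in via stdin.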
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        output, _, _ = run_process(
            shlex.split(scrubber), benchmark_dir, timeout, output)
    if error_scrubber:
        error, _, _ = run_process(
            shlex.split(error_scrubber), benchmark_dir, timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)


def run_regression(proof, dump, wrapper, cvc4_binary, benchmark_path, timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests proof generation (if proof is true), or
    dumps a benchmark and uses that as the input (if dump is true)."""

    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
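        # TPTP: use finite model finding and answer in SZS format.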
        basic_command_line_args.append('--finite-model-find')
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        proof = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    # If there is an ".expect" file for the benchmark, read the metadata
    # from there, otherwise from the benchmark file.
    metadata_filename = benchmark_path + '.expect'
    if os.path.isfile(metadata_filename):
        comment_char = '%'
    else:
        metadata_filename = benchmark_path

    metadata_lines = None
    with open(metadata_filename, 'r') as metadata_file:
        metadata_lines = metadata_file.readlines()

    benchmark_content = None
    if metadata_filename == benchmark_path:
        benchmark_content = ''.join(metadata_lines)
    else:
        with open(benchmark_path, 'r') as benchmark_file:
            benchmark_content = benchmark_file.read()

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    for line in metadata_lines:
        # Skip lines that do not start with the comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):]
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):]
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):]
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):]
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):])
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):])
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # If no expected output or error was defined in the metadata for the
    # benchmark, try to extract the status from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.search(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output(match.group(1))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0

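    # Additional options for all regressions can be supplied via the
    # CVC4_REGRESSION_ARGS environment variable.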
    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not proof and ('(get-unsat-core)' in benchmark_content
                      or '(get-unsat-assumptions)' in benchmark_content
                      or '--check-proofs' in basic_command_line_args
                      or '--dump-proofs' in basic_command_line_args):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return

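    # Run at least one configuration; the empty string corresponds to the
    # default options (no COMMAND-LINE directives in the benchmark).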
    if not command_lines:
        command_lines.append('')

    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        if proof or ('--check-proofs' not in args
                     and '--dump-proofs' not in args):
            all_args = basic_command_line_args + args
            command_line_args_configs.append(all_args)

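            # Depending on the benchmark and the expected output, add one
            # extra configuration that enables additional consistency checks
            # (e.g. --check-models for satisfiable benchmarks), unless the
            # benchmark explicitly opts out of them.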
            extra_command_line_args = []
            if benchmark_ext == '.sy' and \
                    '--no-check-synth-sol' not in all_args and \
                    '--check-synth-sol' not in all_args:
                extra_command_line_args = ['--check-synth-sol']
            if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
                    '--no-check-models' not in all_args:
                extra_command_line_args = ['--check-models']
            if proof and re.search(r'^(unsat|valid)$', expected_output):
                if '--no-check-proofs' not in all_args and \
                        '--incremental' not in all_args and \
                        '--unconstrained-simp' not in all_args and \
                        not cvc4_binary.endswith('pcvc4'):
                    extra_command_line_args = [
                        '--check-proofs', '--no-bv-eq', '--no-bv-ineq',
                        '--no-bv-algebraic'
                    ]
                if '--no-check-unsat-cores' not in all_args and \
                        '--incremental' not in all_args and \
                        '--unconstrained-simp' not in all_args and \
                        not cvc4_binary.endswith('pcvc4'):
                    extra_command_line_args += ['--check-unsat-cores']
            if extra_command_line_args:
                command_line_args_configs.append(
                    all_args + extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output and stderr output are as
    # expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(
            dump, wrapper, scrubber, error_scrubber, cvc4_binary,
            command_line_args, benchmark_dir, benchmark_basename, timeout)
        if output != expected_output:
            print(
                'not ok - Differences between expected and actual output on stdout - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(output.splitlines(),
                                             expected_output.splitlines()):
                print(line)
        elif error != expected_error:
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'.
                format(command_line_args))
            for line in difflib.context_diff(error.splitlines(),
                                             expected_error.splitlines()):
                print(line)
        elif expected_exit_status != exit_status:
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
        else:
            print('ok - Flags: {}'.format(command_line_args))


def main():
    """Parses the command line arguments and then calls the core of the
    script."""

    parser = argparse.ArgumentParser(
        description=
        'Runs a benchmark and checks for the correct exit status and output.')
    parser.add_argument('--proof', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
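    # When the VALGRIND environment variable is set to 1 and no explicit
    # wrapper was given, run CVC4 under valgrind via libtool.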
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

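    # Timeout (in seconds) for each CVC4 invocation; can be overridden via
    # the TEST_TIMEOUT environment variable.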
    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    run_regression(args.proof, args.dump, wrapper, cvc4_binary, args.benchmark,
                   timeout)


if __name__ == "__main__":
    main()