run_regression.py to fail on invalid requirements (#5264)
[cvc5.git] / test / regress / run_regression.py
1 #!/usr/bin/env python3
2 #####################
3 ## run_regression.py
4 ## Top contributors (to current version):
5 ## Andres Noetzli, Yoni Zohar, Mathias Preiner
6 ## This file is part of the CVC4 project.
7 ## Copyright (c) 2009-2020 by the authors listed in the file AUTHORS
8 ## in the top-level source directory and their institutional affiliations.
9 ## All rights reserved. See the file COPYING in the top-level source
10 ## directory for licensing information.
11 ##
12 """
13 Usage:
14
15 run_regression.py [--enable-proof] [--with-lfsc] [--dump]
16 [--use-skip-return-code] [wrapper] cvc4-binary
17 [benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p]
18
19 Runs benchmark and checks for correct exit status and output.
20 """
21
22 import argparse
23 import difflib
24 import os
25 import re
26 import shlex
27 import subprocess
28 import sys
29 import threading
30
31
class Color:
    """ANSI terminal escape sequences used to colorize output."""
    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    RED = '\x1b[91m'
    ENDC = '\x1b[0m'  # reset to the default color
38
39
# Directives that a benchmark file may embed in its comment lines to control
# how the regression is run and what output/exit status is expected.
SCRUBBER = 'SCRUBBER:'
ERROR_SCRUBBER = 'ERROR-SCRUBBER:'
EXPECT = 'EXPECT:'
EXPECT_ERROR = 'EXPECT-ERROR:'
EXIT = 'EXIT:'
COMMAND_LINE = 'COMMAND-LINE:'
REQUIRES = 'REQUIRES:'

# Exit codes of this script.
EXIT_OK = 0
EXIT_FAILURE = 1
EXIT_SKIP = 77  # conventional automake/TAP "skipped test" exit code
STATUS_TIMEOUT = 124  # conventional `timeout(1)` exit status for a timeout
52
53
def print_colored(color, text):
    """Prints `text` line by line, each line wrapped in the ANSI color
    sequence `color` and reset afterwards."""

    for current_line in text.splitlines():
        print('{}{}{}'.format(color, current_line, Color.ENDC))
59
60
def print_diff(actual, expected):
    """Prints a unified diff between `actual` and `expected`, coloring
    added lines green and removed lines red."""

    diff_lines = difflib.unified_diff(expected.splitlines(),
                                      actual.splitlines(),
                                      'expected',
                                      'actual',
                                      lineterm='')
    for entry in diff_lines:
        if entry[:1] == '+':
            print_colored(Color.GREEN, entry)
        elif entry[:1] == '-':
            print_colored(Color.RED, entry)
        else:
            print(entry)
75
76
def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, the
    output and the error output are empty and the exit code is 124."""

    proc = subprocess.Popen(args,
                            cwd=cwd,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = STATUS_TIMEOUT
    try:
        if timeout:
            # Kill the process if it has not finished after `timeout`
            # seconds; communicate() below then returns immediately.
            timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
            timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        if timeout:
            # The timer killed the process and is not active anymore.
            # (A returncode of -9 means the process died from SIGKILL; if
            # the timer already fired, translate that into the conventional
            # timeout status 124 rather than reporting the raw signal.)
            if exit_status == -9 and not timer.is_alive():
                exit_status = STATUS_TIMEOUT
            timer.cancel()

    return out, err, exit_status
107
108
def get_cvc4_features(cvc4_binary):
    """Returns two lists: the features enabled in the CVC4 binary
    `cvc4_binary` and the features disabled in it, as reported by
    `--show-config` (lines of the form `name : yes|no`)."""

    output, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
    if isinstance(output, bytes):
        output = output.decode()

    enabled = []
    disabled = []
    for config_line in output.split('\n'):
        parts = [part.strip() for part in config_line.split(':')]
        if len(parts) != 2:
            continue
        name, setting = parts
        if setting == 'yes':
            enabled.append(name)
        elif setting == 'no':
            disabled.append(name)

    return enabled, disabled
128
129
def logic_supported_with_proofs(logic):
    """Returns True if CVC4's proof support covers the SMT-LIB logic
    `logic`. `logic` may be None (no set-logic in the benchmark), which is
    never supported."""
    assert logic is None or isinstance(logic, str)
    # NOTE: every entry must end with a comma. A missing comma makes Python
    # implicitly concatenate the two adjacent string literals (previously
    # "QF_BVLRA" "QF_AUFBV" collapsed into "QF_BVLRAQF_AUFBV", silently
    # dropping both logics from this list).
    return logic in [
        # single theories
        "QF_BV",
        "QF_UF",
        "QF_A",
        "QF_LRA",
        # two theories
        "QF_UFBV",
        "QF_UFLRA",
        "QF_AUF",
        "QF_ALRA",
        "QF_ABV",
        "QF_BVLRA",
        # three theories
        "QF_AUFBV",
        "QF_ABVLRA",
        "QF_UFBVLRA",
        "QF_AUFLRA",
        # four theories
        "QF_AUFBVLRA",
    ]
153
154
def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If dump is true, the
    function first uses CVC4 to read in and dump the benchmark file and then
    uses that as input.

    Returns a tuple (stdout, stderr, exit status) with stdout/stderr
    whitespace-stripped and decoded to str."""

    # Optional wrapper command (e.g. valgrind) goes before the binary.
    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
        # First pass: parse the benchmark and dump it back out as SMT-LIB 2;
        # second pass: solve the dumped benchmark read from stdin. This
        # exercises CVC4's printer/parser round-trip.
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    # The scrubbers are shell commands that read stdin and write stdout.
    if scrubber:
        output, _, _ = run_process(shlex.split(scrubber), benchmark_dir,
                                   timeout, output)
    if error_scrubber:
        error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir,
                                  timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)
201
202
def run_regression(unsat_cores, proofs, dump, use_skip_return_code,
                   skip_timeout, wrapper, cvc4_binary, benchmark_path,
                   timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and then
    checks whether the output corresponds to the expected output. Optionally
    uses a wrapper `wrapper`, tests unsat cores (if unsat_cores is true),
    checks proofs (if proofs is true), or dumps a benchmark and uses that as
    the input (if dump is true). `use_skip_return_code` enables/disables
    returning 77 when a test is skipped.

    Returns one of EXIT_OK, EXIT_FAILURE or EXIT_SKIP. Prints TAP-style
    ("1..N", "ok"/"not ok") progress output to stdout."""

    # Sanity-check the inputs before doing any work.
    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    cvc4_features, cvc4_disabled_features = get_cvc4_features(cvc4_binary)

    basic_command_line_args = []

    # Pick the comment character and the regexes used to extract the
    # expected status/logic based on the benchmark's file extension.
    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    logic_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        logic_regex = r'\(\s*set-logic\s*(.*)\)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        # TPTP benchmarks expect SZS-formatted status lines as output.
        status_to_output = lambda s: '% SZS status {} for {}'.format(
            s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        unsat_cores = False
        proofs = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    benchmark_lines = None
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_lines = benchmark_file.readlines()
    benchmark_content = ''.join(benchmark_lines)

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    requires = []
    logic = None
    for line in benchmark_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):].strip()
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):].strip()
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):].strip() + '\n'
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):].strip())
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):].strip())
        elif line.startswith(REQUIRES):
            requires.append(line[len(REQUIRES):].strip())
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # Expected output/expected error has not been defined in the metadata for
    # the benchmark. Try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.findall(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output('\n'.join(match))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0
    # Only record the logic if exactly one set-logic command is present.
    if logic_regex:
        logic_match = re.findall(logic_regex, benchmark_content)
        if logic_match and len(logic_match) == 1:
            logic = logic_match[0]

    # Extra arguments injected by the build system / CI environment.
    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not unsat_cores and ('(get-unsat-core)' in benchmark_content
                            or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    # Check the REQUIRES directives: a "no-<feature>" requirement skips the
    # test when the feature is enabled; a plain requirement skips it when
    # the feature is disabled. Unknown feature names fail the test.
    for req_feature in requires:
        is_negative = False
        if req_feature.startswith("no-"):
            req_feature = req_feature[len("no-"):]
            is_negative = True
        if req_feature not in (cvc4_features + cvc4_disabled_features):
            print(
                'Illegal requirement in regression: {}\nAllowed requirements: {}'
                .format(req_feature,
                        ' '.join(cvc4_features + cvc4_disabled_features)))
            return EXIT_FAILURE
        if is_negative:
            if req_feature in cvc4_features:
                print('1..0 # Skipped regression: not valid with {}'.format(
                    req_feature))
                return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
        elif req_feature not in cvc4_features:
            print('1..0 # Skipped regression: {} not supported'.format(
                req_feature))
            return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    # With no COMMAND-LINE directive, run once with no extra options.
    if not command_lines:
        command_lines.append('')

    # Build the list of option configurations to run, dropping those that
    # need unsupported features and adding extra checking options
    # (--check-synth-sol, --debug-check-models, --check-unsat-cores,
    # --check-abducts) where they apply.
    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        all_args = basic_command_line_args + args

        if not unsat_cores and ('--check-unsat-cores' in all_args):
            print(
                '# Skipped command line options ({}): unsat cores not supported without proof support'
                .format(all_args))
            continue
        if not proofs and '--dump-proofs' in all_args:
            print(
                '# Skipped command line options ({}): proof production not supported without LFSC support'
                .format(all_args))
            continue

        command_line_args_configs.append(all_args)

        extra_command_line_args = []
        if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in all_args and \
            '--sygus-rr' not in all_args and \
            '--check-synth-sol' not in all_args:
            extra_command_line_args = ['--check-synth-sol']
        # Note: without re.MULTILINE this only matches when the whole
        # expected output is exactly one of these words.
        if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
           '--no-debug-check-models' not in all_args and \
           '--no-check-models' not in all_args and \
           '--debug-check-models' not in all_args:
            extra_command_line_args = ['--debug-check-models']
        if unsat_cores and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-unsat-cores' not in all_args and \
               '--check-unsat-cores' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args:
                extra_command_line_args += ['--check-unsat-cores']
        if '--no-check-abducts' not in all_args and \
            '--check-abducts' not in all_args:
            extra_command_line_args += ['--check-abducts']
        if extra_command_line_args:
            command_line_args_configs.append(all_args +
                                             extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, stderr output are as expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    exit_code = EXIT_OK
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(dump, wrapper, scrubber,
                                                   error_scrubber, cvc4_binary,
                                                   command_line_args,
                                                   benchmark_dir,
                                                   benchmark_basename, timeout)
        # Ignore leading whitespace on each line when comparing output.
        output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE)
        error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE)
        if exit_status == STATUS_TIMEOUT:
            exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE
            print('Timeout - Flags: {}'.format(command_line_args))
        elif output != expected_output:
            exit_code = EXIT_FAILURE
            print('not ok - Flags: {}'.format(command_line_args))
            print()
            print('Standard output difference')
            print('=' * 80)
            print_diff(output, expected_output)
            print('=' * 80)
            print()
            print()
            if error:
                print('Error output')
                print('=' * 80)
                print_colored(Color.YELLOW, error)
                print('=' * 80)
                print()
        elif error != expected_error:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'
                .format(command_line_args))
            print()
            print('Error output difference')
            print('=' * 80)
            print_diff(error, expected_error)
            print('=' * 80)
            print()
        elif expected_exit_status != exit_status:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
            print()
            print('Output:')
            print('=' * 80)
            print_colored(Color.BLUE, output)
            print('=' * 80)
            print()
            print()
            print('Error output:')
            print('=' * 80)
            print_colored(Color.YELLOW, error)
            print('=' * 80)
            print()
        else:
            print('ok - Flags: {}'.format(command_line_args))

    return exit_code
450
451
def main():
    """Parses the command line arguments and runs the regression on the
    given benchmark, returning the script's exit code."""

    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    # All mode switches are plain boolean flags.
    for flag in ('--enable-proof', '--with-lfsc', '--dump',
                 '--use-skip-return-code', '--skip-timeout'):
        parser.add_argument(flag, action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()

    binary_path = os.path.abspath(args.cvc4_binary)

    # Default to running under valgrind when VALGRIND=1 and no explicit
    # wrapper command was given.
    wrapper = args.wrapper
    if not wrapper and os.environ.get('VALGRIND') == '1':
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    return run_regression(args.enable_proof, args.with_lfsc, args.dump,
                          args.use_skip_return_code, args.skip_timeout,
                          wrapper, binary_path, args.benchmark, timeout)
479
480
if __name__ == "__main__":
    sys.exit(main())