run_regression: Add --skip-timeout option, lower timeout to 600 seconds. (#5353)
test/regress/run_regression.py
#!/usr/bin/env python3
#####################
## run_regression.py
## Top contributors (to current version):
##   Andres Noetzli, Yoni Zohar, Mathias Preiner
## This file is part of the CVC4 project.
## Copyright (c) 2009-2020 by the authors listed in the file AUTHORS
## in the top-level source directory and their institutional affiliations.
## All rights reserved. See the file COPYING in the top-level source
## directory for licensing information.
##
"""
Usage:

    run_regression.py [--enable-proof] [--with-lfsc] [--dump]
        [--use-skip-return-code] [--skip-timeout] [wrapper] cvc4-binary
        [benchmark.cvc | benchmark.smt | benchmark.smt2 | benchmark.p |
         benchmark.sy]

Runs benchmark and checks for correct exit status and output.
"""
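
# Example invocation (illustrative; the binary and benchmark paths are
# placeholders, not actual files in the repository):
#
#   ./run_regression.py --use-skip-return-code --skip-timeout \
#       /path/to/build/bin/cvc4 regress0/example.smt2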

import argparse
import difflib
import os
import re
import shlex
import subprocess
import sys
import threading


class Color:
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'

SCRUBBER = 'SCRUBBER:'
ERROR_SCRUBBER = 'ERROR-SCRUBBER:'
EXPECT = 'EXPECT:'
EXPECT_ERROR = 'EXPECT-ERROR:'
EXIT = 'EXIT:'
COMMAND_LINE = 'COMMAND-LINE:'
REQUIRES = 'REQUIRES:'
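
# A benchmark advertises its expectations in comment lines that use the
# directives above. A minimal .smt2 header might look like this (illustrative
# sketch, not taken from an actual regression):
#
#   ; COMMAND-LINE: --incremental
#   ; EXPECT: sat
#   ; EXIT: 0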

EXIT_OK = 0
EXIT_FAILURE = 1
EXIT_SKIP = 77
STATUS_TIMEOUT = 124


def print_colored(color, text):
    """Prints `text` in color `color`."""

    for line in text.splitlines():
        print(color + line + Color.ENDC)


def print_diff(actual, expected):
    """Prints the difference between `actual` and `expected`."""

    for line in difflib.unified_diff(expected.splitlines(),
                                     actual.splitlines(),
                                     'expected',
                                     'actual',
                                     lineterm=''):
        if line.startswith('+'):
            print_colored(Color.GREEN, line)
        elif line.startswith('-'):
            print_colored(Color.RED, line)
        else:
            print(line)


def run_process(args, cwd, timeout, s_input=None):
    """Runs a process with a timeout `timeout` in seconds. `args` are the
    arguments to execute, `cwd` is the working directory and `s_input` is the
    input to be sent to the process over stdin. Returns the output, the error
    output and the exit code of the process. If the process times out, it is
    killed and the exit code is 124 (STATUS_TIMEOUT)."""

    proc = subprocess.Popen(args,
                            cwd=cwd,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    out = ''
    err = ''
    exit_status = STATUS_TIMEOUT
    try:
        if timeout:
            timer = threading.Timer(timeout, lambda p: p.kill(), [proc])
            timer.start()
        out, err = proc.communicate(input=s_input)
        exit_status = proc.returncode
    finally:
        if timeout:
            # The timer killed the process and is not active anymore.
            if exit_status == -9 and not timer.is_alive():
                exit_status = STATUS_TIMEOUT
            timer.cancel()

    return out, err, exit_status
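
# A minimal illustration of run_process (the command here is a stand-in; the
# test driver only ever runs the solver binary and scrubber commands):
#
#   out, err, status = run_process(['echo', 'hello'], cwd=None, timeout=5.0)
#   # status == 0 on success; STATUS_TIMEOUT (124) if the timer fired first.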


def get_cvc4_features(cvc4_binary):
    """Returns a list of features supported by the CVC4 binary
    `cvc4_binary`."""

    output, _, _ = run_process([cvc4_binary, '--show-config'], None, None)
    if isinstance(output, bytes):
        output = output.decode()

    features = []
    for line in output.split('\n'):
        tokens = [t.strip() for t in line.split(':')]
        if len(tokens) == 2:
            key, value = tokens
            if value == 'yes':
                features.append(key)

    return features
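
# The parser above expects `--show-config` to print `<key> : <value>` lines
# and collects the keys whose value is 'yes'. Illustrative sketch of that
# shape (the feature names are examples, not a verified list):
#
#   proof   : yes
#   symfpu  : no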


def logic_supported_with_proofs(logic):
    assert logic is None or isinstance(logic, str)
    return logic in [
        # single theories
        "QF_BV",
        "QF_UF",
        "QF_A",
        "QF_LRA",
        # two theories
        "QF_UFBV",
        "QF_UFLRA",
        "QF_AUF",
        "QF_ALRA",
        "QF_ABV",
        # (The trailing comma matters: without it, Python would implicitly
        # concatenate "QF_BVLRA" with the next string literal.)
        "QF_BVLRA",
        # three theories
        "QF_AUFBV",
        "QF_ABVLRA",
        "QF_UFBVLRA",
        "QF_AUFLRA",
        # four theories
        "QF_AUFBVLRA"
    ]


def run_benchmark(dump, wrapper, scrubber, error_scrubber, cvc4_binary,
                  command_line, benchmark_dir, benchmark_filename, timeout):
    """Runs CVC4 on the file `benchmark_filename` in the directory
    `benchmark_dir` using the binary `cvc4_binary` with the command line
    options `command_line`. The output is scrubbed using `scrubber` and
    `error_scrubber` for stdout and stderr, respectively. If dump is true,
    the function first uses CVC4 to read in and dump the benchmark file and
    then uses that as input."""

    bin_args = wrapper[:]
    bin_args.append(cvc4_binary)

    output = None
    error = None
    exit_status = None
    if dump:
        dump_args = [
            '--preprocess-only', '--dump', 'raw-benchmark',
            '--output-lang=smt2', '-qq'
        ]
        dump_output, _, _ = run_process(
            bin_args + command_line + dump_args + [benchmark_filename],
            benchmark_dir, timeout)
        output, error, exit_status = run_process(
            bin_args + command_line + ['--lang=smt2', '-'], benchmark_dir,
            timeout, dump_output)
    else:
        output, error, exit_status = run_process(
            bin_args + command_line + [benchmark_filename], benchmark_dir,
            timeout)

    # If a scrubber command has been specified then apply it to the output.
    if scrubber:
        output, _, _ = run_process(shlex.split(scrubber), benchmark_dir,
                                   timeout, output)
    if error_scrubber:
        error, _, _ = run_process(shlex.split(error_scrubber), benchmark_dir,
                                  timeout, error)

    # Popen in Python 3 returns a bytes object instead of a string for
    # stdout/stderr.
    if isinstance(output, bytes):
        output = output.decode()
    if isinstance(error, bytes):
        error = error.decode()
    return (output.strip(), error.strip(), exit_status)
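
# The scrubbers are arbitrary shell commands that normalize solver output
# before it is compared against the expected output. A hypothetical directive
# (the sed expression is illustrative):
#
#   ; SCRUBBER: sed -e 's/in line [0-9]*/in line N/'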


def run_regression(unsat_cores, proofs, dump, use_skip_return_code,
                   skip_timeout, wrapper, cvc4_binary, benchmark_path,
                   timeout):
    """Determines the expected output for a benchmark, runs CVC4 on it and
    then checks whether the output corresponds to the expected output.
    Optionally uses a wrapper `wrapper`, tests unsat cores (if unsat_cores is
    true), checks proofs (if proofs is true), or dumps a benchmark and uses
    that as the input (if dump is true). `use_skip_return_code`
    enables/disables returning 77 when a test is skipped."""

    if not os.access(cvc4_binary, os.X_OK):
        sys.exit(
            '"{}" does not exist or is not executable'.format(cvc4_binary))
    if not os.path.isfile(benchmark_path):
        sys.exit('"{}" does not exist or is not a file'.format(benchmark_path))

    cvc4_features = get_cvc4_features(cvc4_binary)

    basic_command_line_args = []

    benchmark_basename = os.path.basename(benchmark_path)
    benchmark_filename, benchmark_ext = os.path.splitext(benchmark_basename)
    benchmark_dir = os.path.dirname(benchmark_path)
    comment_char = '%'
    status_regex = None
    logic_regex = None
    status_to_output = lambda s: s
    if benchmark_ext == '.smt':
        status_regex = r':status\s*(sat|unsat)'
        comment_char = ';'
    elif benchmark_ext == '.smt2':
        status_regex = r'set-info\s*:status\s*(sat|unsat)'
        logic_regex = r'\(\s*set-logic\s*(.*)\)'
        comment_char = ';'
    elif benchmark_ext == '.cvc':
        pass
    elif benchmark_ext == '.p':
        status_regex = r'% Status\s*:\s*(Theorem|Unsatisfiable|CounterSatisfiable|Satisfiable)'
        status_to_output = lambda s: '% SZS status {} for {}'.format(
            s, benchmark_filename)
    elif benchmark_ext == '.sy':
        comment_char = ';'
        # Do not use proofs/unsat-cores with .sy files
        unsat_cores = False
        proofs = False
    else:
        sys.exit('"{}" must be *.cvc or *.smt or *.smt2 or *.p or *.sy'.format(
            benchmark_basename))

    benchmark_lines = None
    with open(benchmark_path, 'r') as benchmark_file:
        benchmark_lines = benchmark_file.readlines()
    benchmark_content = ''.join(benchmark_lines)

    # Extract the metadata for the benchmark.
    scrubber = None
    error_scrubber = None
    expected_output = ''
    expected_error = ''
    expected_exit_status = None
    command_lines = []
    requires = []
    logic = None
    for line in benchmark_lines:
        # Skip lines that do not start with a comment character.
        if line[0] != comment_char:
            continue
        line = line[1:].lstrip()

        if line.startswith(SCRUBBER):
            scrubber = line[len(SCRUBBER):].strip()
        elif line.startswith(ERROR_SCRUBBER):
            error_scrubber = line[len(ERROR_SCRUBBER):].strip()
        elif line.startswith(EXPECT):
            expected_output += line[len(EXPECT):].strip() + '\n'
        elif line.startswith(EXPECT_ERROR):
            expected_error += line[len(EXPECT_ERROR):].strip() + '\n'
        elif line.startswith(EXIT):
            expected_exit_status = int(line[len(EXIT):].strip())
        elif line.startswith(COMMAND_LINE):
            command_lines.append(line[len(COMMAND_LINE):].strip())
        elif line.startswith(REQUIRES):
            requires.append(line[len(REQUIRES):].strip())
    expected_output = expected_output.strip()
    expected_error = expected_error.strip()

    # Expected output/expected error has not been defined in the metadata for
    # the benchmark. Try to extract the information from the benchmark itself.
    if expected_output == '' and expected_error == '':
        match = None
        if status_regex:
            match = re.findall(status_regex, benchmark_content)

        if match:
            expected_output = status_to_output('\n'.join(match))
        elif expected_exit_status is None:
            # If there is no expected output/error and the exit status has not
            # been set explicitly, the benchmark is invalid.
            sys.exit('Cannot determine status of "{}"'.format(benchmark_path))
    if expected_exit_status is None:
        expected_exit_status = 0
    if logic_regex:
        logic_match = re.findall(logic_regex, benchmark_content)
        if logic_match and len(logic_match) == 1:
            logic = logic_match[0]

    if 'CVC4_REGRESSION_ARGS' in os.environ:
        basic_command_line_args += shlex.split(
            os.environ['CVC4_REGRESSION_ARGS'])

    if not unsat_cores and ('(get-unsat-core)' in benchmark_content
                            or '(get-unsat-assumptions)' in benchmark_content):
        print(
            '1..0 # Skipped regression: unsat cores not supported without proof support'
        )
        return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    for req_feature in requires:
        if req_feature.startswith("no-"):
            inv_feature = req_feature[len("no-"):]
            if inv_feature in cvc4_features:
                print('1..0 # Skipped regression: not valid with {}'.format(
                    inv_feature))
                return (EXIT_SKIP if use_skip_return_code else EXIT_OK)
        elif req_feature not in cvc4_features:
            print('1..0 # Skipped regression: {} not supported'.format(
                req_feature))
            return (EXIT_SKIP if use_skip_return_code else EXIT_OK)

    if not command_lines:
        command_lines.append('')

    command_line_args_configs = []
    for command_line in command_lines:
        args = shlex.split(command_line)
        all_args = basic_command_line_args + args

        if not unsat_cores and ('--check-unsat-cores' in all_args):
            print(
                '# Skipped command line options ({}): unsat cores not supported without proof support'
                .format(all_args))
            continue
        if not proofs and '--dump-proofs' in all_args:
            print(
                '# Skipped command line options ({}): proof production not supported without LFSC support'
                .format(all_args))
            continue

        command_line_args_configs.append(all_args)

        extra_command_line_args = []
        if benchmark_ext == '.sy' and \
            '--no-check-synth-sol' not in all_args and \
            '--sygus-rr' not in all_args and \
            '--check-synth-sol' not in all_args:
            extra_command_line_args = ['--check-synth-sol']
        if re.search(r'^(sat|invalid|unknown)$', expected_output) and \
           '--no-debug-check-models' not in all_args and \
           '--no-check-models' not in all_args and \
           '--debug-check-models' not in all_args:
            extra_command_line_args = ['--debug-check-models']
        if unsat_cores and re.search(r'^(unsat|valid)$', expected_output):
            if '--no-check-unsat-cores' not in all_args and \
               '--check-unsat-cores' not in all_args and \
               '--incremental' not in all_args and \
               '--unconstrained-simp' not in all_args:
                extra_command_line_args += ['--check-unsat-cores']
        if '--no-check-abducts' not in all_args and \
           '--check-abducts' not in all_args:
            extra_command_line_args += ['--check-abducts']
        if extra_command_line_args:
            command_line_args_configs.append(all_args +
                                             extra_command_line_args)

    # Run CVC4 on the benchmark with the different option sets and check
    # whether the exit status, stdout output, stderr output are as expected.
    print('1..{}'.format(len(command_line_args_configs)))
    print('# Starting')
    exit_code = EXIT_OK
    for command_line_args in command_line_args_configs:
        output, error, exit_status = run_benchmark(dump, wrapper, scrubber,
                                                   error_scrubber, cvc4_binary,
                                                   command_line_args,
                                                   benchmark_dir,
                                                   benchmark_basename, timeout)
        output = re.sub(r'^[ \t]*', '', output, flags=re.MULTILINE)
        error = re.sub(r'^[ \t]*', '', error, flags=re.MULTILINE)
        if exit_status == STATUS_TIMEOUT:
            exit_code = EXIT_SKIP if skip_timeout else EXIT_FAILURE
            print('Timeout - Flags: {}'.format(command_line_args))
        elif output != expected_output:
            exit_code = EXIT_FAILURE
            print('not ok - Flags: {}'.format(command_line_args))
            print()
            print('Standard output difference')
            print('=' * 80)
            print_diff(output, expected_output)
            print('=' * 80)
            print()
            print()
            if error:
                print('Error output')
                print('=' * 80)
                print_colored(Color.YELLOW, error)
                print('=' * 80)
                print()
        elif error != expected_error:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Differences between expected and actual output on stderr - Flags: {}'
                .format(command_line_args))
            print()
            print('Error output difference')
            print('=' * 80)
            print_diff(error, expected_error)
            print('=' * 80)
            print()
        elif expected_exit_status != exit_status:
            exit_code = EXIT_FAILURE
            print(
                'not ok - Expected exit status "{}" but got "{}" - Flags: {}'.
                format(expected_exit_status, exit_status, command_line_args))
            print()
            print('Output:')
            print('=' * 80)
            print_colored(Color.BLUE, output)
            print('=' * 80)
            print()
            print()
            print('Error output:')
            print('=' * 80)
            print_colored(Color.YELLOW, error)
            print('=' * 80)
            print()
        else:
            print('ok - Flags: {}'.format(command_line_args))

    return exit_code
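
# Note: the '1..N', 'ok - ...', and 'not ok - ...' lines above follow the
# Test Anything Protocol (TAP) conventions, and exit code 77 (EXIT_SKIP) is
# the automake-style convention for a skipped test.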


def main():
    """Parses the command line arguments and then calls the core of the
    script."""

    parser = argparse.ArgumentParser(
        description=
        'Runs benchmark and checks for correct exit status and output.')
    parser.add_argument('--enable-proof', action='store_true')
    parser.add_argument('--with-lfsc', action='store_true')
    parser.add_argument('--dump', action='store_true')
    parser.add_argument('--use-skip-return-code', action='store_true')
    parser.add_argument('--skip-timeout', action='store_true')
    parser.add_argument('wrapper', nargs='*')
    parser.add_argument('cvc4_binary')
    parser.add_argument('benchmark')
    args = parser.parse_args()
    cvc4_binary = os.path.abspath(args.cvc4_binary)

    wrapper = args.wrapper
    if os.environ.get('VALGRIND') == '1' and not wrapper:
        wrapper = ['libtool', '--mode=execute', 'valgrind']

    timeout = float(os.getenv('TEST_TIMEOUT', 600.0))

    return run_regression(args.enable_proof, args.with_lfsc, args.dump,
                          args.use_skip_return_code, args.skip_timeout,
                          wrapper, cvc4_binary, args.benchmark, timeout)
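
# Environment variables recognized above (values shown are illustrative):
#
#   TEST_TIMEOUT=300 ./run_regression.py ...   # override the 600 s default
#   VALGRIND=1 ./run_regression.py ...         # wrap the solver in valgrind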


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)