16643332a6fc489d6a41ff01ba1016277e03e938
[cvc5.git] / src / theory / strings / strings_entail.h
1 /********************* */
2 /*! \file strings_entail.h
3 ** \verbatim
4 ** Top contributors (to current version):
5 ** Andrew Reynolds, Andres Noetzli
6 ** This file is part of the CVC4 project.
7 ** Copyright (c) 2009-2021 by the authors listed in the file AUTHORS
8 ** in the top-level source directory and their institutional affiliations.
9 ** All rights reserved. See the file COPYING in the top-level source
10 ** directory for licensing information.\endverbatim
11 **
12 ** \brief Entailment tests involving strings
13 **/
14
15 #include "cvc4_private.h"
16
17 #ifndef CVC4__THEORY__STRINGS__STRING_ENTAIL_H
18 #define CVC4__THEORY__STRINGS__STRING_ENTAIL_H
19
20 #include <vector>
21
22 #include "expr/node.h"
23
24 namespace CVC5 {
25 namespace theory {
26 namespace strings {
27
28 class SequencesRewriter;
29
30 /**
31 * Entailment tests involving strings.
32 * Some of these techniques are described in Reynolds et al, "High Level
33 * Abstractions for Simplifying Extended String Constraints in SMT", CAV 2019.
34 */
35 class StringsEntail
36 {
37 public:
38 StringsEntail(SequencesRewriter& rewriter);
39
40 /** can constant contain list
41 * return true if constant c can contain the list l in order
42 * firstc/lastc store which indices in l were used to determine the return
43 * value.
44 * (This is typically used when this function returns false, for minimizing
45 * explanations)
46 *
47 * For example:
48 * canConstantContainList( "abc", { x, "c", y } ) returns true
49 * firstc/lastc are updated to 1/1
50 * canConstantContainList( "abc", { x, "d", y } ) returns false
51 * firstc/lastc are updated to 1/1
52 * canConstantContainList( "abcdef", { x, "b", y, "a", z, "c", w }
53 * returns false
54 * firstc/lastc are updated to 1/3
55 * canConstantContainList( "abcdef", { x, "b", y, "e", z, "c", w }
56 * returns false
57 * firstc/lastc are updated to 1/5
58 */
59 static bool canConstantContainList(Node c,
60 std::vector<Node>& l,
61 int& firstc,
62 int& lastc);
63 /** can constant contain concat
64 * same as above but with n = str.++( l ) instead of l
65 */
66 static bool canConstantContainConcat(Node c, Node n, int& firstc, int& lastc);
67
68 /** strip symbolic length
69 *
70 * This function strips off components of n1 whose length is less than or
71 * equal to argument curr, and stores them in nr. The direction dir
72 * determines whether the components are removed from the start or end of n1.
73 * If `strict` is set to true, then the function only returns true if full
74 * length `curr` can be stripped.
75 *
76 * In detail, this function updates n1 to n1' such that:
77 * If dir=1,
78 * n1 = str.++( nr, n1' )
79 * If dir=-1
80 * n1 = str.++( n1', nr )
81 * It updates curr to curr' such that:
82 * curr' = curr - str.len( str.++( nr ) ), and
83 * curr' >= 0
84 * where the latter fact is determined by checkArithEntail.
85 *
86 * This function returns true if n1 is modified.
87 *
88 * For example:
89 *
90 * stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)+1 )
91 * returns true
92 * n1 is updated to { "bc", y }
93 * nr is updated to { x, "a" }
94 * curr is updated to 0 *
95 *
96 * stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)-1 )
97 * returns false
98 *
99 * stripSymbolicLength( { y, "abc", x }, {}, 1, str.len(x)+1 )
100 * returns false
101 *
102 * stripSymbolicLength( { x, "abc", y }, {}, -1, 2*str.len(y)+4 )
103 * returns true
104 * n1 is updated to { x }
105 * nr is updated to { "abc", y }
106 * curr is updated to str.len(y)+1
107 */
108 static bool stripSymbolicLength(std::vector<Node>& n1,
109 std::vector<Node>& nr,
110 int dir,
111 Node& curr,
112 bool strict = false);
113 /** component contains
114 * This function is used when rewriting str.contains( t1, t2 ), where
115 * n1 is the vector form of t1
116 * n2 is the vector form of t2
117 *
118 * If this function returns n>=0 for some n, then
119 * n1 = { x1...x{n-1} xn...x{n+s} x{n+s+1}...xm },
120 * n2 = { y1...ys },
121 * y1 is a suffix of xn,
122 * y2...y{s-1} = x{n+1}...x{n+s-1}, and
123 * ys is a prefix of x{n+s}
124 * Otherwise it returns -1.
125 *
126 * This function may update n1 if computeRemainder = true.
127 * We maintain the invariant that the resulting value n1'
128 * of n1 after this function is such that:
129 * n1 = str.++( nb, n1', ne )
130 * The vectors nb and ne have the following properties.
131 * If computeRemainder = true, then
132 * If remainderDir != -1, then
133 * ne is { x{n+s}' x{n+s+1}...xm }
134 * where x{n+s} = str.++( ys, x{n+s}' ).
135 * If remainderDir != 1, then
136 * nb is { x1, ..., x{n-1}, xn' }
137 * where xn = str.++( xn', y1 ).
138 *
139 * For example:
140 *
141 * componentContains({ x, "abc", x }, { "b" }, {}, true, 0)
142 * returns 1,
143 * n1 is updated to { "b" },
144 * nb is updated to { x, "a" },
145 * ne is updated to { "c", x }
146 *
147 * componentContains({ x, "abc", x }, { "b" }, {}, true, 1)
148 * returns 1,
149 * n1 is updated to { x, "ab" },
150 * ne is updated to { "c", x }
151 *
152 * componentContains({ y, z, "abc", x, "def" }, { "c", x, "de" }, {}, true, 1)
153 * returns 2,
154 * n1 is updated to { y, z, "abc", x, "de" },
155 * ne is updated to { "f" }
156 *
157 * componentContains({ y, "abc", x, "def" }, { "c", x, "de" }, {}, true, -1)
158 * returns 1,
159 * n1 is updated to { "c", x, "def" },
160 * nb is updated to { y, "ab" }
161 */
162 int componentContains(std::vector<Node>& n1,
163 std::vector<Node>& n2,
164 std::vector<Node>& nb,
165 std::vector<Node>& ne,
166 bool computeRemainder = false,
167 int remainderDir = 0);
168 /** strip constant endpoints
169 * This function is used when rewriting str.contains( t1, t2 ), where
170 * n1 is the vector form of t1
171 * n2 is the vector form of t2
172 *
173 * It modifies n1 to a new vector n1' such that:
174 * (1) str.contains( str.++( n1 ), str.++( n2 ) ) is equivalent to
175 * str.contains( str.++( n1' ), str.++( n2 ) )
176 * (2) str.++( n1 ) = str.++( nb, n1', ne )
177 *
178 * "dir" is the direction in which we can modify n1:
179 * if dir = 1, then we allow dropping components from the front of n1,
180 * if dir = -1, then we allow dropping components from the back of n1,
181 * if dir = 0, then we allow dropping components from either.
182 *
183 * It returns true if n1 is modified.
184 *
185 * For example:
186 * stripConstantEndpoints({ "ab", x, "de" }, { "c" }, {}, {}, 1)
187 * returns true,
188 * n1 is updated to { x, "de" }
189 * nb is updated to { "ab" }
190 * stripConstantEndpoints({ "ab", x, "de" }, { "bd" }, {}, {}, 0)
191 * returns true,
192 * n1 is updated to { "b", x, "d" }
193 * nb is updated to { "a" }
194 * ne is updated to { "e" }
195 * stripConstantEndpoints({ "ad", substr("ccc",x,y) }, { "d" }, {}, {}, -1)
196 * returns true,
197 * n1 is updated to {"ad"}
198 * ne is updated to { substr("ccc",x,y) }
199 */
200 static bool stripConstantEndpoints(std::vector<Node>& n1,
201 std::vector<Node>& n2,
202 std::vector<Node>& nb,
203 std::vector<Node>& ne,
204 int dir = 0);
205
206 /**
207 * Checks whether a string term `a` is entailed to contain or not contain a
208 * string term `b`.
209 *
210 * @param a The string that is checked whether it contains `b`
211 * @param b The string that is checked whether it is contained in `a`
212 * @param fullRewriter Determines whether the function can use the full
213 * rewriter or only `rewriteContains()` (useful for avoiding loops)
214 * @return true node if it can be shown that `a` contains `b`, false node if
215 * it can be shown that `a` does not contain `b`, null node otherwise
216 */
217 Node checkContains(Node a, Node b, bool fullRewriter = true);
218
219 /** entail non-empty
220 *
221 * Checks whether string a is entailed to be non-empty. Is equivalent to
222 * the call checkArithEntail( len( a ), true ).
223 */
224 static bool checkNonEmpty(Node a);
225
226 /**
227 * Checks whether string has at most/exactly length one. Length one strings
228 * can be used for more aggressive rewriting because there is guaranteed that
229 * it cannot be overlap multiple components in a string concatenation.
230 *
231 * @param s The string to check
232 * @param strict If true, the string must have exactly length one, otherwise
233 * at most length one
234 * @return True if the string has at most/exactly length one, false otherwise
235 */
236 static bool checkLengthOne(Node s, bool strict = false);
237
238 /**
239 * Checks whether it is always true that `a` is a strict subset of `b` in the
240 * multiset domain.
241 *
242 * Examples:
243 *
244 * a = (str.++ "A" x), b = (str.++ "A" x "B") ---> true
245 * a = (str.++ "A" x), b = (str.++ "B" x "AA") ---> true
246 * a = (str.++ "A" x), b = (str.++ "B" y "AA") ---> false
247 *
248 * @param a The term for which it should be checked if it is a strict subset
249 * of `b` in the multiset domain
250 * @param b The term for which it should be checked if it is a strict
251 * superset of `a` in the multiset domain
252 * @return True if it is always the case that `a` is a strict subset of `b`,
253 * false otherwise.
254 */
255 static bool checkMultisetSubset(Node a, Node b);
256
257 /**
258 * Returns a character `c` if it is always the case that str.in.re(a, c*),
259 * i.e. if all possible values of `a` only consist of `c` characters, and the
260 * null node otherwise. If `a` is the empty string, the function returns an
261 * empty string.
262 *
263 * @param a The node to check for homogeneity
264 * @return If `a` is homogeneous, the only character that it may contain, the
265 * empty string if `a` is empty, and the null node otherwise
266 */
267 static Node checkHomogeneousString(Node a);
268
269 /**
270 * Overapproximates the possible values of node n. This overapproximation
271 * assumes that n can return a value x or the empty string and tries to find
272 * the simplest x such that this holds. In the general case, x is the same as
273 * the input n. This overapproximation can be used to sort terms with the
274 * same possible values in string concatenation for example.
275 *
276 * Example:
277 *
278 * getStringOrEmpty( (str.replace "" x y) ) --> y because (str.replace "" x y)
279 * either returns y or ""
280 *
281 * getStringOrEmpty( (str.substr "ABC" x y) ) --> (str.substr "ABC" x y)
282 * because the function could not compute a simpler
283 */
284 static Node getStringOrEmpty(Node n);
285
286 /**
287 * Infers a conjunction of equalities that correspond to (str.contains x y)
288 * if it can show that the length of y is greater or equal to the length of
289 * x. If y is a concatentation, we get x = y1 ++ ... ++ yn, the conjunction
290 * is of the form:
291 *
292 * (and (= x (str.++ y1' ... ym')) (= y1'' "") ... (= yk'' ""))
293 *
294 * where each yi'' are yi that must be empty for (= x y) to hold and yi' are
295 * yi that the function could not infer anything about. Returns a null node
296 * if the function cannot infer that str.len(y) >= str.len(x). Returns (= x
297 * y) if the function can infer that str.len(y) >= str.len(x) but cannot
298 * infer that any of the yi must be empty.
299 */
300 static Node inferEqsFromContains(Node x, Node y);
301
302 private:
303 /** component contains base
304 *
305 * This function is a helper for the above function.
306 *
307 * It returns true if n2 is contained in n1 with the following
308 * restrictions:
309 * If dir=1, then n2 must be a suffix of n1.
310 * If dir=-1, then n2 must be a prefix of n1.
311 *
312 * If computeRemainder is true, then n1rb and n1re are
313 * updated such that :
314 * n1 = str.++( n1rb, n2, n1re )
315 * where a null value of n1rb and n1re indicates the
316 * empty string.
317 *
318 * For example:
319 *
320 * componentContainsBase("cabe", "ab", n1rb, n1re, 1, false)
321 * returns false.
322 *
323 * componentContainsBase("cabe", "ab", n1rb, n1re, 0, true)
324 * returns true,
325 * n1rb is set to "c",
326 * n1re is set to "e".
327 *
328 * componentContainsBase(y, str.substr(y,0,5), n1rb, n1re, -1, true)
329 * returns true,
330 * n1re is set to str.substr(y,5,str.len(y)).
331 *
332 *
333 * Notice that this function may return false when it cannot compute a
334 * remainder when it otherwise would have returned true. For example:
335 *
336 * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, false)
337 * returns true.
338 *
339 * Hence, we know that str.substr(y,x,z) is contained in y. However:
340 *
341 * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, true)
342 * returns false.
343 *
344 * The reason is since computeRemainder=true, it must be that
345 * y = str.++( n1rb, str.substr(y,x,z), n1re )
346 * for some n1rb, n1re. However, to construct such n1rb, n1re would require
347 * e.g. the terms:
348 * y = str.++( ite( x+z < 0 OR x < 0, "", str.substr(y,0,x) ),
349 * str.substr(y,x,z),
350 * ite( x+z < 0 OR x < 0, y, str.substr(y,x+z,len(y)) ) )
351 *
352 * Since we do not wish to introduce ITE terms in the rewriter, we instead
353 * return false, indicating that we cannot compute the remainder.
354 */
355 bool componentContainsBase(
356 Node n1, Node n2, Node& n1rb, Node& n1re, int dir, bool computeRemainder);
357 /**
358 * Simplifies a given node `a` s.t. the result is a concatenation of string
359 * terms that can be interpreted as a multiset and which contains all
360 * multisets that `a` could form.
361 *
362 * Examples:
363 *
364 * (str.substr "AA" 0 n) ---> "AA"
365 * (str.replace "AAA" x "BB") ---> (str.++ "AAA" "BB")
366 *
367 * @param a The node to simplify
368 * @return A concatenation that can be interpreted as a multiset
369 */
370 static Node getMultisetApproximation(Node a);
371
372 private:
373 /**
374 * Reference to the sequences rewriter that owns this `StringsEntail`
375 * instance.
376 */
377 SequencesRewriter& d_rewriter;
378 };
379
380 } // namespace strings
381 } // namespace theory
382 } // namespace CVC5
383
384 #endif /* CVC4__THEORY__STRINGS__STRING_ENTAIL_H */