8ea26fca955a573456fb1b2c6c99b62b367a37db
[cvc5.git] / src / theory / strings / regexp_elim.cpp
1 /********************* */
2 /*! \file regexp_elim.cpp
3 ** \verbatim
4 ** Top contributors (to current version):
5 ** Andrew Reynolds
6 ** This file is part of the CVC4 project.
7 ** Copyright (c) 2009-2018 by the authors listed in the file AUTHORS
8 ** in the top-level source directory) and their institutional affiliations.
9 ** All rights reserved. See the file COPYING in the top-level source
10 ** directory for licensing information.\endverbatim
11 **
12 ** \brief Implementation of techniques for eliminating regular expressions
13 **
14 **/
15
16 #include "theory/strings/regexp_elim.h"
17
18 #include "options/strings_options.h"
19 #include "theory/strings/theory_strings_rewriter.h"
20
21 using namespace CVC4;
22 using namespace CVC4::kind;
23 using namespace CVC4::theory;
24 using namespace CVC4::theory::strings;
25
26 RegExpElimination::RegExpElimination()
27 {
28 d_zero = NodeManager::currentNM()->mkConst(Rational(0));
29 d_one = NodeManager::currentNM()->mkConst(Rational(1));
30 d_neg_one = NodeManager::currentNM()->mkConst(Rational(-1));
31 }
32
33 Node RegExpElimination::eliminate(Node atom)
34 {
35 Assert(atom.getKind() == STRING_IN_REGEXP);
36 if (atom[1].getKind() == REGEXP_CONCAT)
37 {
38 return eliminateConcat(atom);
39 }
40 else if (atom[1].getKind() == REGEXP_STAR)
41 {
42 return eliminateStar(atom);
43 }
44 return Node::null();
45 }
46
47 Node RegExpElimination::eliminateConcat(Node atom)
48 {
49 NodeManager* nm = NodeManager::currentNM();
50 Node x = atom[0];
51 Node lenx = nm->mkNode(STRING_LENGTH, x);
52 Node re = atom[1];
53 // memberships of the form x in re.++ * s1 * ... * sn *, where * are
54 // any number of repetitions (exact or indefinite) of re.allchar.
55 Trace("re-elim-debug") << "Try re concat with gaps " << atom << std::endl;
56 std::vector<Node> children;
57 TheoryStringsRewriter::getConcat(re, children);
58 bool onlySigmasAndConsts = true;
59 std::vector<Node> sep_children;
60 std::vector<unsigned> gap_minsize;
61 std::vector<bool> gap_exact;
62 // the first gap is initially strict zero
63 gap_minsize.push_back(0);
64 gap_exact.push_back(true);
65 for (const Node& c : children)
66 {
67 Trace("re-elim-debug") << " " << c << std::endl;
68 onlySigmasAndConsts = false;
69 if (c.getKind() == STRING_TO_REGEXP)
70 {
71 onlySigmasAndConsts = true;
72 sep_children.push_back(c[0]);
73 // the next gap is initially strict zero
74 gap_minsize.push_back(0);
75 gap_exact.push_back(true);
76 }
77 else if (c.getKind() == REGEXP_STAR && c[0].getKind() == REGEXP_SIGMA)
78 {
79 // found a gap of any size
80 onlySigmasAndConsts = true;
81 gap_exact[gap_exact.size() - 1] = false;
82 }
83 else if (c.getKind() == REGEXP_SIGMA)
84 {
85 // add one to the minimum size of the gap
86 onlySigmasAndConsts = true;
87 gap_minsize[gap_minsize.size() - 1]++;
88 }
89 if (!onlySigmasAndConsts)
90 {
91 Trace("re-elim-debug") << "...cannot handle " << c << std::endl;
92 break;
93 }
94 }
95 // we should always rewrite concatenations that are purely re.allchar
96 // and re.*( re.allchar ).
97 Assert(!onlySigmasAndConsts || !sep_children.empty());
98 if (onlySigmasAndConsts && !sep_children.empty())
99 {
100 bool canProcess = true;
101 std::vector<Node> conj;
102 // The following constructs a set of constraints that encodes that a
103 // set of string terms are found, in order, in string x.
104 // prev_end stores the current (symbolic) index in x that we are
105 // searching.
106 Node prev_end = d_zero;
107 unsigned gap_minsize_end = gap_minsize.back();
108 bool gap_exact_end = gap_exact.back();
109 std::vector<Node> non_greedy_find_vars;
110 for (unsigned i = 0, size = sep_children.size(); i < size; i++)
111 {
112 Node sc = sep_children[i];
113 if (gap_minsize[i] > 0)
114 {
115 // the gap to this child is at least gap_minsize[i]
116 prev_end =
117 nm->mkNode(PLUS, prev_end, nm->mkConst(Rational(gap_minsize[i])));
118 }
119 Node lensc = nm->mkNode(STRING_LENGTH, sc);
120 if (gap_exact[i])
121 {
122 // if the gap is exact, it is a substring constraint
123 Node curr = prev_end;
124 Node ss = nm->mkNode(STRING_SUBSTR, x, curr, lensc);
125 conj.push_back(ss.eqNode(sc));
126 prev_end = nm->mkNode(PLUS, curr, lensc);
127 }
128 else
129 {
130 // otherwise, we can use indexof to represent some next occurrence
131 if (gap_exact[i + 1] && i + 1 != size)
132 {
133 if (!options::regExpElimAgg())
134 {
135 canProcess = false;
136 break;
137 }
138 // if the gap after this one is strict, we need a non-greedy find
139 // thus, we add a symbolic constant
140 Node k = nm->mkBoundVar(nm->integerType());
141 non_greedy_find_vars.push_back(k);
142 prev_end = nm->mkNode(PLUS, prev_end, k);
143 }
144 Node curr = nm->mkNode(STRING_STRIDOF, x, sc, prev_end);
145 Node idofFind = curr.eqNode(d_neg_one).negate();
146 conj.push_back(idofFind);
147 prev_end = nm->mkNode(PLUS, curr, lensc);
148 }
149 }
150
151 if (canProcess)
152 {
153 // since sep_children is non-empty, conj is non-empty
154 Assert(!conj.empty());
155 // Process the last gap, if necessary.
156 // Notice that if the last gap is not exact and its minsize is zero,
157 // then the last indexof/substr constraint entails the following
158 // constraint, so it is not necessary to add.
159 // Below, we may write "A" for (str.to.re "A") and _ for re.allchar:
160 Node cEnd = nm->mkConst(Rational(gap_minsize_end));
161 if (gap_exact_end)
162 {
163 Assert(!sep_children.empty());
164 // if it is strict, it corresponds to a substr case.
165 // For example:
166 // x in (re.++ "A" (re.* _) "B" _ _) --->
167 // ... ^ "B" = substr( x, len( x ) - 3, 1 ) ^ ...
168 Node sc = sep_children.back();
169 Node lenSc = nm->mkNode(STRING_LENGTH, sc);
170 Node loc = nm->mkNode(MINUS, lenx, nm->mkNode(PLUS, lenSc, cEnd));
171 Node scc = sc.eqNode(nm->mkNode(STRING_SUBSTR, x, loc, lenSc));
172 conj.push_back(scc);
173 // We also must ensure that we fit. This constraint is necessary in
174 // addition to the constraint above. Take this example:
175 // x in (re.++ "A" _ (re.* _) "B" _) --->
176 // substr( x, 0, 1 ) = "A" ^ // find "A"
177 // indexof( x, "B", 2 ) != -1 ^ // find "B" >=1 after "A"
178 // substr( x, len(x)-2, 1 ) = "B" ^ // "B" is at end - 2.
179 // indexof( x, "B", 2 ) <= len( x ) - 2
180 // The last constaint ensures that the second and third constraints
181 // may refer to the same "B". If it were not for the last constraint, it
182 // would have been the case than "ABB" would be a model for x, where
183 // the second constraint refers to the third position, and the third
184 // constraint refers to the second position.
185 Node fit = nm->mkNode(gap_exact[sep_children.size() - 1] ? EQUAL : LEQ,
186 nm->mkNode(MINUS, prev_end, lenSc),
187 loc);
188 conj.push_back(fit);
189 }
190 else if (gap_minsize_end > 0)
191 {
192 // if it is non-strict, we are in a "greedy find" situtation where
193 // we just need to ensure that the next occurrence fits.
194 // For example:
195 // x in (re.++ "A" (re.* _) "B" _ _ (re.* _)) --->
196 // ... ^ indexof( x, "B", 1 ) + 2 <= len( x )
197 Node fit = nm->mkNode(LEQ, nm->mkNode(PLUS, prev_end, cEnd), lenx);
198 conj.push_back(fit);
199 }
200 Node res = conj.size() == 1 ? conj[0] : nm->mkNode(AND, conj);
201 // process the non-greedy find variables
202 if (!non_greedy_find_vars.empty())
203 {
204 std::vector<Node> children;
205 for (const Node& v : non_greedy_find_vars)
206 {
207 Node bound = nm->mkNode(
208 AND, nm->mkNode(LEQ, d_zero, v), nm->mkNode(LT, v, lenx));
209 children.push_back(bound);
210 }
211 children.push_back(res);
212 Node body = nm->mkNode(AND, children);
213 Node bvl = nm->mkNode(BOUND_VAR_LIST, non_greedy_find_vars);
214 res = nm->mkNode(EXISTS, bvl, body);
215 }
216 // Examples of this elimination:
217 // x in (re.++ "A" (re.* _) "B" (re.* _)) --->
218 // substr(x,0,1)="A" ^ indexof(x,"B",1)!=-1
219 // x in (re.++ (re.* _) "A" _ _ _ (re.* _) "B" _ _ (re.* _)) --->
220 // indexof(x,"A",0)!=-1 ^
221 // indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 ) != -1 ^
222 // indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 )+1+2 <= len(x)
223
224 // An example of a non-greedy find:
225 // x in re.++( re.*( _ ), "A", _, "B", re.*( _ ) ) --->
226 // exists k. 0 <= k < len( x ) ^
227 // indexof( x, "A", k ) != -1 ^
228 // substr( x, indexof( x, "A", k )+2, 1 ) = "B"
229 return returnElim(atom, res, "concat-with-gaps");
230 }
231 }
232
233 if (!options::regExpElimAgg())
234 {
235 return Node::null();
236 }
237 // only aggressive rewrites below here
238
239 // if the first or last child is constant string, we can split the membership
240 // into a conjunction of two memberships.
241 Node sStartIndex = d_zero;
242 Node sLength = lenx;
243 std::vector<Node> sConstraints;
244 std::vector<Node> rexpElimChildren;
245 unsigned nchildren = children.size();
246 Assert(nchildren > 1);
247 for (unsigned r = 0; r < 2; r++)
248 {
249 unsigned index = r == 0 ? 0 : nchildren - 1;
250 Assert(children[index + (r == 0 ? 1 : -1)].getKind() != STRING_TO_REGEXP);
251 Node c = children[index];
252 if (c.getKind() == STRING_TO_REGEXP)
253 {
254 Node s = c[0];
255 Node lens = nm->mkNode(STRING_LENGTH, s);
256 Node sss = r == 0 ? d_zero : nm->mkNode(MINUS, lenx, lens);
257 Node ss = nm->mkNode(STRING_SUBSTR, x, sss, lens);
258 sConstraints.push_back(ss.eqNode(s));
259 if (r == 0)
260 {
261 sStartIndex = lens;
262 }
263 sLength = nm->mkNode(MINUS, sLength, lens);
264 }
265 if (r == 1 && !sConstraints.empty())
266 {
267 // add the middle children
268 for (unsigned i = 1; i < (nchildren - 1); i++)
269 {
270 rexpElimChildren.push_back(children[i]);
271 }
272 }
273 if (c.getKind() != STRING_TO_REGEXP)
274 {
275 rexpElimChildren.push_back(c);
276 }
277 }
278 Assert(rexpElimChildren.size() + sConstraints.size() == nchildren);
279 if (!sConstraints.empty())
280 {
281 Node ss = nm->mkNode(STRING_SUBSTR, x, sStartIndex, sLength);
282 Assert(!rexpElimChildren.empty());
283 Node regElim =
284 TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rexpElimChildren);
285 sConstraints.push_back(nm->mkNode(STRING_IN_REGEXP, ss, regElim));
286 Node ret = nm->mkNode(AND, sConstraints);
287 // e.g.
288 // x in re.++( "A", R ) ---> substr(x,0,1)="A" ^ substr(x,1,len(x)-1) in R
289 return returnElim(atom, ret, "concat-splice");
290 }
291 Assert(nchildren > 1);
292 for (unsigned i = 0; i < nchildren; i++)
293 {
294 if (children[i].getKind() == STRING_TO_REGEXP)
295 {
296 Node s = children[i][0];
297 Node lens = nm->mkNode(STRING_LENGTH, s);
298 // there exists an index in this string such that the substring is this
299 Node k;
300 std::vector<Node> echildren;
301 if (i == 0)
302 {
303 k = d_zero;
304 }
305 else if (i + 1 == nchildren)
306 {
307 k = nm->mkNode(MINUS, lenx, lens);
308 }
309 else
310 {
311 k = nm->mkBoundVar(nm->integerType());
312 Node bound =
313 nm->mkNode(AND,
314 nm->mkNode(LEQ, d_zero, k),
315 nm->mkNode(LT, k, nm->mkNode(MINUS, lenx, lens)));
316 echildren.push_back(bound);
317 }
318 Node substrEq = nm->mkNode(STRING_SUBSTR, x, k, lens).eqNode(s);
319 echildren.push_back(substrEq);
320 if (i > 0)
321 {
322 std::vector<Node> rprefix;
323 rprefix.insert(rprefix.end(), children.begin(), children.begin() + i);
324 Node rpn = TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rprefix);
325 Node substrPrefix = nm->mkNode(
326 STRING_IN_REGEXP, nm->mkNode(STRING_SUBSTR, x, d_zero, k), rpn);
327 echildren.push_back(substrPrefix);
328 }
329 if (i + 1 < nchildren)
330 {
331 std::vector<Node> rsuffix;
332 rsuffix.insert(rsuffix.end(), children.begin() + i + 1, children.end());
333 Node rps = TheoryStringsRewriter::mkConcat(REGEXP_CONCAT, rsuffix);
334 Node ks = nm->mkNode(PLUS, k, lens);
335 Node substrSuffix = nm->mkNode(
336 STRING_IN_REGEXP,
337 nm->mkNode(STRING_SUBSTR, x, ks, nm->mkNode(MINUS, lenx, ks)),
338 rps);
339 echildren.push_back(substrSuffix);
340 }
341 Node body = nm->mkNode(AND, echildren);
342 if (k.getKind() == BOUND_VARIABLE)
343 {
344 Node bvl = nm->mkNode(BOUND_VAR_LIST, k);
345 body = nm->mkNode(EXISTS, bvl, body);
346 }
347 // e.g. x in re.++( R1, "AB", R2 ) --->
348 // exists k.
349 // 0 <= k <= (len(x)-2) ^
350 // substr( x, k, 2 ) = "AB" ^
351 // substr( x, 0, k ) in R1 ^
352 // substr( x, k+2, len(x)-(k+2) ) in R2
353 return returnElim(atom, body, "concat-find");
354 }
355 }
356 return Node::null();
357 }
358
359 Node RegExpElimination::eliminateStar(Node atom)
360 {
361 if (!options::regExpElimAgg())
362 {
363 return Node::null();
364 }
365 // only aggressive rewrites below here
366
367 NodeManager* nm = NodeManager::currentNM();
368 Node x = atom[0];
369 Node lenx = nm->mkNode(STRING_LENGTH, x);
370 Node re = atom[1];
371 // for regular expression star,
372 // if the period is a fixed constant, we can turn it into a bounded
373 // quantifier
374 std::vector<Node> disj;
375 if (re[0].getKind() == REGEXP_UNION)
376 {
377 for (const Node& r : re[0])
378 {
379 disj.push_back(r);
380 }
381 }
382 else
383 {
384 disj.push_back(re[0]);
385 }
386 bool lenOnePeriod = true;
387 std::vector<Node> char_constraints;
388 Node index = nm->mkBoundVar(nm->integerType());
389 Node substr_ch = nm->mkNode(STRING_SUBSTR, x, index, d_one);
390 substr_ch = Rewriter::rewrite(substr_ch);
391 // handle the case where it is purely characters
392 for (const Node& r : disj)
393 {
394 Assert(r.getKind() != REGEXP_UNION);
395 Assert(r.getKind() != REGEXP_SIGMA);
396 lenOnePeriod = false;
397 // lenOnePeriod is true if this regular expression is a single character
398 // regular expression
399 if (r.getKind() == STRING_TO_REGEXP)
400 {
401 Node s = r[0];
402 if (s.isConst() && s.getConst<String>().size() == 1)
403 {
404 lenOnePeriod = true;
405 }
406 }
407 else if (r.getKind() == REGEXP_RANGE)
408 {
409 lenOnePeriod = true;
410 }
411 if (!lenOnePeriod)
412 {
413 break;
414 }
415 else
416 {
417 Node regexp_ch = nm->mkNode(STRING_IN_REGEXP, substr_ch, r);
418 regexp_ch = Rewriter::rewrite(regexp_ch);
419 Assert(regexp_ch.getKind() != STRING_IN_REGEXP);
420 char_constraints.push_back(regexp_ch);
421 }
422 }
423 if (lenOnePeriod)
424 {
425 Assert(!char_constraints.empty());
426 Node bound = nm->mkNode(
427 AND, nm->mkNode(LEQ, d_zero, index), nm->mkNode(LT, index, lenx));
428 Node conc = char_constraints.size() == 1 ? char_constraints[0]
429 : nm->mkNode(OR, char_constraints);
430 Node body = nm->mkNode(OR, bound.negate(), conc);
431 Node bvl = nm->mkNode(BOUND_VAR_LIST, index);
432 Node res = nm->mkNode(FORALL, bvl, body);
433 // e.g.
434 // x in (re.* (re.union "A" "B" )) --->
435 // forall k. 0<=k<len(x) => (substr(x,k,1) in "A" OR substr(x,k,1) in "B")
436 return returnElim(atom, res, "star-char");
437 }
438 // otherwise, for stars of constant length these are periodic
439 if (disj.size() == 1)
440 {
441 Node r = disj[0];
442 if (r.getKind() == STRING_TO_REGEXP)
443 {
444 Node s = r[0];
445 if (s.isConst())
446 {
447 Node lens = nm->mkNode(STRING_LENGTH, s);
448 lens = Rewriter::rewrite(lens);
449 Assert(lens.isConst());
450 std::vector<Node> conj;
451 Node bound = nm->mkNode(
452 AND,
453 nm->mkNode(LEQ, d_zero, index),
454 nm->mkNode(LT, index, nm->mkNode(INTS_DIVISION, lenx, lens)));
455 Node conc =
456 nm->mkNode(STRING_SUBSTR, x, nm->mkNode(MULT, index, lens), lens)
457 .eqNode(s);
458 Node body = nm->mkNode(OR, bound.negate(), conc);
459 Node bvl = nm->mkNode(BOUND_VAR_LIST, index);
460 Node res = nm->mkNode(FORALL, bvl, body);
461 res = nm->mkNode(
462 AND, nm->mkNode(INTS_MODULUS, lenx, lens).eqNode(d_zero), res);
463 // e.g.
464 // x in ("abc")* --->
465 // forall k. 0 <= k < (len( x ) div 3) => substr(x,3*k,3) = "abc" ^
466 // len(x) mod 3 = 0
467 return returnElim(atom, res, "star-constant");
468 }
469 }
470 }
471 return Node::null();
472 }
473
474 Node RegExpElimination::returnElim(Node atom, Node atomElim, const char* id)
475 {
476 Trace("re-elim") << "re-elim: " << atom << " to " << atomElim << " by " << id
477 << "." << std::endl;
478 return atomElim;
479 }