1 /********************* */
2 /*! \file regexp_elim.cpp
4 ** Top contributors (to current version):
6 ** This file is part of the CVC4 project.
7 ** Copyright (c) 2009-2018 by the authors listed in the file AUTHORS
8 ** in the top-level source directory) and their institutional affiliations.
9 ** All rights reserved. See the file COPYING in the top-level source
10 ** directory for licensing information.\endverbatim
12 ** \brief Implementation of techniques for eliminating regular expressions
16 #include "theory/strings/regexp_elim.h"
18 #include "options/strings_options.h"
19 #include "theory/strings/theory_strings_rewriter.h"
22 using namespace CVC4::kind
;
23 using namespace CVC4::theory
;
24 using namespace CVC4::theory::strings
;
26 RegExpElimination::RegExpElimination()
28 d_zero
= NodeManager::currentNM()->mkConst(Rational(0));
29 d_one
= NodeManager::currentNM()->mkConst(Rational(1));
30 d_neg_one
= NodeManager::currentNM()->mkConst(Rational(-1));
33 Node
RegExpElimination::eliminate(Node atom
)
35 Assert(atom
.getKind() == STRING_IN_REGEXP
);
36 if (atom
[1].getKind() == REGEXP_CONCAT
)
38 return eliminateConcat(atom
);
40 else if (atom
[1].getKind() == REGEXP_STAR
)
42 return eliminateStar(atom
);
47 Node
RegExpElimination::eliminateConcat(Node atom
)
49 NodeManager
* nm
= NodeManager::currentNM();
51 Node lenx
= nm
->mkNode(STRING_LENGTH
, x
);
53 // memberships of the form x in re.++ * s1 * ... * sn *, where * are
54 // any number of repetitions (exact or indefinite) of re.allchar.
55 Trace("re-elim-debug") << "Try re concat with gaps " << atom
<< std::endl
;
56 std::vector
<Node
> children
;
57 TheoryStringsRewriter::getConcat(re
, children
);
58 bool onlySigmasAndConsts
= true;
59 std::vector
<Node
> sep_children
;
60 std::vector
<unsigned> gap_minsize
;
61 std::vector
<bool> gap_exact
;
62 // the first gap is initially strict zero
63 gap_minsize
.push_back(0);
64 gap_exact
.push_back(true);
65 for (const Node
& c
: children
)
67 Trace("re-elim-debug") << " " << c
<< std::endl
;
68 onlySigmasAndConsts
= false;
69 if (c
.getKind() == STRING_TO_REGEXP
)
71 onlySigmasAndConsts
= true;
72 sep_children
.push_back(c
[0]);
73 // the next gap is initially strict zero
74 gap_minsize
.push_back(0);
75 gap_exact
.push_back(true);
77 else if (c
.getKind() == REGEXP_STAR
&& c
[0].getKind() == REGEXP_SIGMA
)
79 // found a gap of any size
80 onlySigmasAndConsts
= true;
81 gap_exact
[gap_exact
.size() - 1] = false;
83 else if (c
.getKind() == REGEXP_SIGMA
)
85 // add one to the minimum size of the gap
86 onlySigmasAndConsts
= true;
87 gap_minsize
[gap_minsize
.size() - 1]++;
89 if (!onlySigmasAndConsts
)
91 Trace("re-elim-debug") << "...cannot handle " << c
<< std::endl
;
95 // we should always rewrite concatenations that are purely re.allchar
96 // and re.*( re.allchar ).
97 Assert(!onlySigmasAndConsts
|| !sep_children
.empty());
98 if (onlySigmasAndConsts
&& !sep_children
.empty())
100 bool canProcess
= true;
101 std::vector
<Node
> conj
;
102 // The following constructs a set of constraints that encodes that a
103 // set of string terms are found, in order, in string x.
104 // prev_end stores the current (symbolic) index in x that we are
106 Node prev_end
= d_zero
;
107 unsigned gap_minsize_end
= gap_minsize
.back();
108 bool gap_exact_end
= gap_exact
.back();
109 std::vector
<Node
> non_greedy_find_vars
;
110 for (unsigned i
= 0, size
= sep_children
.size(); i
< size
; i
++)
112 Node sc
= sep_children
[i
];
113 if (gap_minsize
[i
] > 0)
115 // the gap to this child is at least gap_minsize[i]
117 nm
->mkNode(PLUS
, prev_end
, nm
->mkConst(Rational(gap_minsize
[i
])));
119 Node lensc
= nm
->mkNode(STRING_LENGTH
, sc
);
122 // if the gap is exact, it is a substring constraint
123 Node curr
= prev_end
;
124 Node ss
= nm
->mkNode(STRING_SUBSTR
, x
, curr
, lensc
);
125 conj
.push_back(ss
.eqNode(sc
));
126 prev_end
= nm
->mkNode(PLUS
, curr
, lensc
);
130 // otherwise, we can use indexof to represent some next occurrence
131 if (gap_exact
[i
+ 1] && i
+ 1 != size
)
133 if (!options::regExpElimAgg())
138 // if the gap after this one is strict, we need a non-greedy find
139 // thus, we add a symbolic constant
140 Node k
= nm
->mkBoundVar(nm
->integerType());
141 non_greedy_find_vars
.push_back(k
);
142 prev_end
= nm
->mkNode(PLUS
, prev_end
, k
);
144 Node curr
= nm
->mkNode(STRING_STRIDOF
, x
, sc
, prev_end
);
145 Node idofFind
= curr
.eqNode(d_neg_one
).negate();
146 conj
.push_back(idofFind
);
147 prev_end
= nm
->mkNode(PLUS
, curr
, lensc
);
153 // since sep_children is non-empty, conj is non-empty
154 Assert(!conj
.empty());
155 // Process the last gap, if necessary.
156 // Notice that if the last gap is not exact and its minsize is zero,
157 // then the last indexof/substr constraint entails the following
158 // constraint, so it is not necessary to add.
159 if (gap_minsize_end
> 0 || gap_exact_end
)
161 Node fit
= nm
->mkNode(
162 gap_exact_end
? EQUAL
: LEQ
,
163 nm
->mkNode(PLUS
, prev_end
, nm
->mkConst(Rational(gap_minsize_end
))),
167 Node res
= conj
.size() == 1 ? conj
[0] : nm
->mkNode(AND
, conj
);
168 // process the non-greedy find variables
169 if (!non_greedy_find_vars
.empty())
171 std::vector
<Node
> children
;
172 for (const Node
& v
: non_greedy_find_vars
)
174 Node bound
= nm
->mkNode(
175 AND
, nm
->mkNode(LEQ
, d_zero
, v
), nm
->mkNode(LT
, v
, lenx
));
176 children
.push_back(bound
);
178 children
.push_back(res
);
179 Node body
= nm
->mkNode(AND
, children
);
180 Node bvl
= nm
->mkNode(BOUND_VAR_LIST
, non_greedy_find_vars
);
181 res
= nm
->mkNode(EXISTS
, bvl
, body
);
183 // e.g., writing "A" for (str.to.re "A") and _ for re.allchar:
184 // x in (re.++ "A" (re.* _) "B" (re.* _)) --->
185 // substr(x,0,1)="A" ^ indexof(x,"B",1)!=-1
186 // x in (re.++ (re.* _) "A" _ _ _ (re.* _) "B" _ _ (re.* _)) --->
187 // indexof(x,"A",0)!=-1 ^
188 // indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 ) != -1 ^
189 // indexof( x, "B", indexof( x, "A", 0 ) + 1 + 3 )+1+2 <= len(x)
191 // An example of a non-greedy find:
192 // x in re.++( re.*( _ ), "A", _, "B", re.*( _ ) ) --->
193 // exists k. 0 <= k < len( x ) ^
194 // indexof( x, "A", k ) != -1 ^
195 // substr( x, indexof( x, "A", k )+2, 1 ) = "B"
196 return returnElim(atom
, res
, "concat-with-gaps");
200 if (!options::regExpElimAgg())
204 // only aggressive rewrites below here
206 // if the first or last child is constant string, we can split the membership
207 // into a conjunction of two memberships.
208 Node sStartIndex
= d_zero
;
210 std::vector
<Node
> sConstraints
;
211 std::vector
<Node
> rexpElimChildren
;
212 unsigned nchildren
= children
.size();
213 Assert(nchildren
> 1);
214 for (unsigned r
= 0; r
< 2; r
++)
216 unsigned index
= r
== 0 ? 0 : nchildren
- 1;
217 Assert(children
[index
+ (r
== 0 ? 1 : -1)].getKind() != STRING_TO_REGEXP
);
218 Node c
= children
[index
];
219 if (c
.getKind() == STRING_TO_REGEXP
)
222 Node lens
= nm
->mkNode(STRING_LENGTH
, s
);
223 Node sss
= r
== 0 ? d_zero
: nm
->mkNode(MINUS
, lenx
, lens
);
224 Node ss
= nm
->mkNode(STRING_SUBSTR
, x
, sss
, lens
);
225 sConstraints
.push_back(ss
.eqNode(s
));
230 sLength
= nm
->mkNode(MINUS
, sLength
, lens
);
232 if (r
== 1 && !sConstraints
.empty())
234 // add the middle children
235 for (unsigned i
= 1; i
< (nchildren
- 1); i
++)
237 rexpElimChildren
.push_back(children
[i
]);
240 if (c
.getKind() != STRING_TO_REGEXP
)
242 rexpElimChildren
.push_back(c
);
245 Assert(rexpElimChildren
.size() + sConstraints
.size() == nchildren
);
246 if (!sConstraints
.empty())
248 Node ss
= nm
->mkNode(STRING_SUBSTR
, x
, sStartIndex
, sLength
);
249 Assert(!rexpElimChildren
.empty());
251 TheoryStringsRewriter::mkConcat(REGEXP_CONCAT
, rexpElimChildren
);
252 sConstraints
.push_back(nm
->mkNode(STRING_IN_REGEXP
, ss
, regElim
));
253 Node ret
= nm
->mkNode(AND
, sConstraints
);
255 // x in re.++( "A", R ) ---> substr(x,0,1)="A" ^ substr(x,1,len(x)-1) in R
256 return returnElim(atom
, ret
, "concat-splice");
258 Assert(nchildren
> 1);
259 for (unsigned i
= 0; i
< nchildren
; i
++)
261 if (children
[i
].getKind() == STRING_TO_REGEXP
)
263 Node s
= children
[i
][0];
264 Node lens
= nm
->mkNode(STRING_LENGTH
, s
);
265 // there exists an index in this string such that the substring is this
267 std::vector
<Node
> echildren
;
272 else if (i
+ 1 == nchildren
)
274 k
= nm
->mkNode(MINUS
, lenx
, lens
);
278 k
= nm
->mkBoundVar(nm
->integerType());
281 nm
->mkNode(LEQ
, d_zero
, k
),
282 nm
->mkNode(LT
, k
, nm
->mkNode(MINUS
, lenx
, lens
)));
283 echildren
.push_back(bound
);
285 Node substrEq
= nm
->mkNode(STRING_SUBSTR
, x
, k
, lens
).eqNode(s
);
286 echildren
.push_back(substrEq
);
289 std::vector
<Node
> rprefix
;
290 rprefix
.insert(rprefix
.end(), children
.begin(), children
.begin() + i
);
291 Node rpn
= TheoryStringsRewriter::mkConcat(REGEXP_CONCAT
, rprefix
);
292 Node substrPrefix
= nm
->mkNode(
293 STRING_IN_REGEXP
, nm
->mkNode(STRING_SUBSTR
, x
, d_zero
, k
), rpn
);
294 echildren
.push_back(substrPrefix
);
296 if (i
+ 1 < nchildren
)
298 std::vector
<Node
> rsuffix
;
299 rsuffix
.insert(rsuffix
.end(), children
.begin() + i
+ 1, children
.end());
300 Node rps
= TheoryStringsRewriter::mkConcat(REGEXP_CONCAT
, rsuffix
);
301 Node ks
= nm
->mkNode(PLUS
, k
, lens
);
302 Node substrSuffix
= nm
->mkNode(
304 nm
->mkNode(STRING_SUBSTR
, x
, ks
, nm
->mkNode(MINUS
, lenx
, ks
)),
306 echildren
.push_back(substrSuffix
);
308 Node body
= nm
->mkNode(AND
, echildren
);
309 if (k
.getKind() == BOUND_VARIABLE
)
311 Node bvl
= nm
->mkNode(BOUND_VAR_LIST
, k
);
312 body
= nm
->mkNode(EXISTS
, bvl
, body
);
314 // e.g. x in re.++( R1, "AB", R2 ) --->
316 // 0 <= k <= (len(x)-2) ^
317 // substr( x, k, 2 ) = "AB" ^
318 // substr( x, 0, k ) in R1 ^
319 // substr( x, k+2, len(x)-(k+2) ) in R2
320 return returnElim(atom
, body
, "concat-find");
326 Node
RegExpElimination::eliminateStar(Node atom
)
328 if (!options::regExpElimAgg())
332 // only aggressive rewrites below here
334 NodeManager
* nm
= NodeManager::currentNM();
336 Node lenx
= nm
->mkNode(STRING_LENGTH
, x
);
338 // for regular expression star,
339 // if the period is a fixed constant, we can turn it into a bounded
341 std::vector
<Node
> disj
;
342 if (re
[0].getKind() == REGEXP_UNION
)
344 for (const Node
& r
: re
[0])
351 disj
.push_back(re
[0]);
353 bool lenOnePeriod
= true;
354 std::vector
<Node
> char_constraints
;
355 Node index
= nm
->mkBoundVar(nm
->integerType());
356 Node substr_ch
= nm
->mkNode(STRING_SUBSTR
, x
, index
, d_one
);
357 substr_ch
= Rewriter::rewrite(substr_ch
);
358 // handle the case where it is purely characters
359 for (const Node
& r
: disj
)
361 Assert(r
.getKind() != REGEXP_UNION
);
362 Assert(r
.getKind() != REGEXP_SIGMA
);
363 lenOnePeriod
= false;
364 // lenOnePeriod is true if this regular expression is a single character
365 // regular expression
366 if (r
.getKind() == STRING_TO_REGEXP
)
369 if (s
.isConst() && s
.getConst
<String
>().size() == 1)
374 else if (r
.getKind() == REGEXP_RANGE
)
384 Node regexp_ch
= nm
->mkNode(STRING_IN_REGEXP
, substr_ch
, r
);
385 regexp_ch
= Rewriter::rewrite(regexp_ch
);
386 Assert(regexp_ch
.getKind() != STRING_IN_REGEXP
);
387 char_constraints
.push_back(regexp_ch
);
392 Assert(!char_constraints
.empty());
393 Node bound
= nm
->mkNode(
394 AND
, nm
->mkNode(LEQ
, d_zero
, index
), nm
->mkNode(LT
, index
, lenx
));
395 Node conc
= char_constraints
.size() == 1 ? char_constraints
[0]
396 : nm
->mkNode(OR
, char_constraints
);
397 Node body
= nm
->mkNode(OR
, bound
.negate(), conc
);
398 Node bvl
= nm
->mkNode(BOUND_VAR_LIST
, index
);
399 Node res
= nm
->mkNode(FORALL
, bvl
, body
);
401 // x in (re.* (re.union "A" "B" )) --->
402 // forall k. 0<=k<len(x) => (substr(x,k,1) in "A" OR substr(x,k,1) in "B")
403 return returnElim(atom
, res
, "star-char");
405 // otherwise, for stars of constant length these are periodic
406 if (disj
.size() == 1)
409 if (r
.getKind() == STRING_TO_REGEXP
)
414 Node lens
= nm
->mkNode(STRING_LENGTH
, s
);
415 lens
= Rewriter::rewrite(lens
);
416 Assert(lens
.isConst());
417 std::vector
<Node
> conj
;
418 Node bound
= nm
->mkNode(
420 nm
->mkNode(LEQ
, d_zero
, index
),
421 nm
->mkNode(LT
, index
, nm
->mkNode(INTS_DIVISION
, lenx
, lens
)));
423 nm
->mkNode(STRING_SUBSTR
, x
, nm
->mkNode(MULT
, index
, lens
), lens
)
425 Node body
= nm
->mkNode(OR
, bound
.negate(), conc
);
426 Node bvl
= nm
->mkNode(BOUND_VAR_LIST
, index
);
427 Node res
= nm
->mkNode(FORALL
, bvl
, body
);
429 AND
, nm
->mkNode(INTS_MODULUS
, lenx
, lens
).eqNode(d_zero
), res
);
431 // x in ("abc")* --->
432 // forall k. 0 <= k < (len( x ) div 3) => substr(x,3*k,3) = "abc" ^
434 return returnElim(atom
, res
, "star-constant");
441 Node
RegExpElimination::returnElim(Node atom
, Node atomElim
, const char* id
)
443 Trace("re-elim") << "re-elim: " << atom
<< " to " << atomElim
<< " by " << id