From 77e98815254c68301ffcd7fb8addeb6751c51187 Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Thu, 6 Aug 2020 06:48:45 -0500 Subject: [PATCH] (proof-new) Refactor regular expression operation (#4596) This refactors the regular expression operation class so that some of its key methods are static, so that they can used by both the regular expression solver and the strings proof checker. Notice that many cases of regular expression unfolding are deleted by this PR, since they are unnecessary. This is due to the fact that all regular expression memberships are rewritten except those whose RHS are re.++ or re.*. --- src/theory/strings/regexp_operation.cpp | 888 +++++++++------------- src/theory/strings/regexp_operation.h | 86 ++- src/theory/strings/regexp_solver.cpp | 22 +- src/theory/strings/regexp_solver.h | 6 +- src/theory/strings/theory_strings.cpp | 3 +- test/unit/theory/regexp_operation_black.h | 7 +- 6 files changed, 456 insertions(+), 556 deletions(-) diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp index 017d861a2..273518edd 100644 --- a/src/theory/strings/regexp_operation.cpp +++ b/src/theory/strings/regexp_operation.cpp @@ -16,20 +16,20 @@ #include "theory/strings/regexp_operation.h" -#include "expr/kind.h" +#include "expr/node_algorithm.h" #include "options/strings_options.h" +#include "theory/rewriter.h" #include "theory/strings/regexp_entail.h" #include "theory/strings/theory_strings_utils.h" #include "theory/strings/word.h" -using namespace CVC4; using namespace CVC4::kind; namespace CVC4 { namespace theory { namespace strings { -RegExpOpr::RegExpOpr() +RegExpOpr::RegExpOpr(SkolemCache* sc) : d_true(NodeManager::currentNM()->mkConst(true)), d_false(NodeManager::currentNM()->mkConst(false)), d_emptyRegexp(NodeManager::currentNM()->mkNode(kind::REGEXP_EMPTY, @@ -38,7 +38,9 @@ RegExpOpr::RegExpOpr() d_one(NodeManager::currentNM()->mkConst(::CVC4::Rational(1))), d_sigma(NodeManager::currentNM()->mkNode(kind::REGEXP_SIGMA, std::vector{})), - d_sigma_star(NodeManager::currentNM()->mkNode(kind::REGEXP_STAR, d_sigma)) + d_sigma_star( + NodeManager::currentNM()->mkNode(kind::REGEXP_STAR, d_sigma)), + d_sc(sc) { d_emptyString = Word::mkEmptyWord(NodeManager::currentNM()->stringType()); @@ -821,444 +823,250 @@ void RegExpOpr::firstChars(Node r, std::set &pcset, SetNodes &pvset) } } -//simplify -void RegExpOpr::simplify(Node t, std::vector< Node > &new_nodes, bool polarity) { - Trace("strings-regexp-simpl") << "RegExp-Simpl starts with " << t << ", polarity=" << polarity << std::endl; +Node RegExpOpr::simplify(Node t, bool polarity) +{ + Trace("strings-regexp-simpl") + << "RegExpOpr::simplify: " << t << ", polarity=" << polarity << std::endl; Assert(t.getKind() == kind::STRING_IN_REGEXP); - Node str = t[0]; - Node re = t[1]; - if(polarity) { - simplifyPRegExp( str, re, new_nodes ); - } else { - simplifyNRegExp( str, re, new_nodes ); + Node tlit = polarity ? t : t.notNode(); + Node conc; + std::map::const_iterator itr = d_simpCache.find(tlit); + if (itr != d_simpCache.end()) + { + return itr->second; } - Trace("strings-regexp-simpl") << "RegExp-Simpl returns (" << new_nodes.size() << "):\n"; - for(unsigned i=0; i &new_nodes ) { - std::pair < Node, Node > p(s, r); - NodeManager *nm = NodeManager::currentNM(); - std::map < std::pair< Node, Node >, Node >::const_iterator itr = d_simpl_neg_cache.find(p); - if(itr != d_simpl_neg_cache.end()) { - new_nodes.push_back( itr->second ); - } else { - Kind k = r.getKind(); - Node conc; - switch( k ) { - case kind::REGEXP_EMPTY: { - conc = d_true; - break; - } - case kind::REGEXP_SIGMA: { - conc = d_one.eqNode(NodeManager::currentNM()->mkNode(kind::STRING_LENGTH, s)).negate(); - break; - } - case kind::REGEXP_RANGE: { - std::vector< Node > vec; - unsigned a = r[0].getConst().front(); - unsigned b = r[1].getConst().front(); - for (unsigned c = a; c <= b; c++) - { - std::vector tmpVec; - tmpVec.push_back(c); - Node tmp = s.eqNode(nm->mkConst(String(tmpVec))).negate(); - vec.push_back( tmp ); - } - conc = vec.size()==1? vec[0] : NodeManager::currentNM()->mkNode(kind::AND, vec); - break; - } - case kind::STRING_TO_REGEXP: { - conc = s.eqNode(r[0]).negate(); - break; - } - case kind::REGEXP_CONCAT: { - // The following simplification states that - // ~( s in R1 ++ R2 ) - // is equivalent to - // forall x. - // 0 <= x <= len(s) => - // ~( substr(s,0,x) in R1 ) OR ~( substr(s,x,len(s)-x) in R2) - Node lens = nm->mkNode(STRING_LENGTH, s); - // the index we are removing from the RE concatenation - unsigned indexRm = 0; - Node b1; - Node b1v; - // As an optimization to the above reduction, if we can determine that - // all strings in the language of R1 have the same length, say n, - // then the conclusion of the reduction is quantifier-free: - // ~( substr(s,0,n) in R1 ) OR ~( substr(s,n,len(s)-n) in R2) - Node reLength = RegExpEntail::getFixedLengthForRegexp(r[0]); - if (reLength.isNull()) - { - // try from the opposite end - unsigned indexE = r.getNumChildren() - 1; - reLength = RegExpEntail::getFixedLengthForRegexp(r[indexE]); - if (!reLength.isNull()) - { - indexRm = indexE; - } - } - Node guard; - if (reLength.isNull()) - { - b1 = nm->mkBoundVar(nm->integerType()); - b1v = nm->mkNode(BOUND_VAR_LIST, b1); - guard = nm->mkNode(AND, - nm->mkNode(GEQ, b1, d_zero), - nm->mkNode(GEQ, nm->mkNode(STRING_LENGTH, s), b1)); - } - else - { - b1 = reLength; - } - Node s1; - Node s2; - if (indexRm == 0) - { - s1 = nm->mkNode(STRING_SUBSTR, s, d_zero, b1); - s2 = nm->mkNode(STRING_SUBSTR, s, b1, nm->mkNode(MINUS, lens, b1)); - } - else - { - s1 = nm->mkNode(STRING_SUBSTR, s, nm->mkNode(MINUS, lens, b1), b1); - s2 = - nm->mkNode(STRING_SUBSTR, s, d_zero, nm->mkNode(MINUS, lens, b1)); - } - Node s1r1 = nm->mkNode(STRING_IN_REGEXP, s1, r[indexRm]).negate(); - std::vector nvec; - for (unsigned i = 0, nchild = r.getNumChildren(); i < nchild; i++) - { - if (i != indexRm) - { - nvec.push_back( r[i] ); - } - } - Node r2 = nvec.size() == 1 ? nvec[0] : nm->mkNode(REGEXP_CONCAT, nvec); - r2 = Rewriter::rewrite(r2); - Node s2r2 = nm->mkNode(STRING_IN_REGEXP, s2, r2).negate(); - conc = nm->mkNode(OR, s1r1, s2r2); - if (!b1v.isNull()) - { - conc = nm->mkNode(OR, guard.negate(), conc); - conc = nm->mkNode(FORALL, b1v, conc); - } - break; - } - case kind::REGEXP_UNION: { - std::vector< Node > c_and; - for(unsigned i=0; imkNode(kind::STRING_IN_REGEXP, s, r[i]).negate()); - } - } - conc = c_and.size() == 0 ? d_true : - c_and.size() == 1 ? c_and[0] : NodeManager::currentNM()->mkNode(kind::AND, c_and); - break; - } - case kind::REGEXP_INTER: { - bool emptyflag = false; - std::vector< Node > c_or; - for(unsigned i=0; imkNode(kind::STRING_IN_REGEXP, s, r[i]).negate()); - } - } - if(emptyflag) { - conc = d_true; - } else { - conc = c_or.size() == 1 ? c_or[0] : NodeManager::currentNM()->mkNode(kind::OR, c_or); - } - break; - } - case kind::REGEXP_STAR: { - if(s == d_emptyString) { - conc = d_false; - } else if(r[0].getKind() == kind::REGEXP_EMPTY) { - conc = s.eqNode(d_emptyString).negate(); - } else if(r[0].getKind() == kind::REGEXP_SIGMA) { - conc = d_false; - } else { - Node lens = NodeManager::currentNM()->mkNode(kind::STRING_LENGTH, s); - Node sne = s.eqNode(d_emptyString).negate(); - Node b1 = NodeManager::currentNM()->mkBoundVar(NodeManager::currentNM()->integerType()); - Node b1v = NodeManager::currentNM()->mkNode(kind::BOUND_VAR_LIST, b1); - Node g1 = NodeManager::currentNM()->mkNode( kind::AND, NodeManager::currentNM()->mkNode(kind::GEQ, b1, d_one), - NodeManager::currentNM()->mkNode( kind::GEQ, lens, b1 ) ); - //internal - Node s1 = NodeManager::currentNM()->mkNode(kind::STRING_SUBSTR, s, d_zero, b1); - Node s2 = NodeManager::currentNM()->mkNode(kind::STRING_SUBSTR, s, b1, NodeManager::currentNM()->mkNode(kind::MINUS, lens, b1)); - Node s1r1 = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, s1, r[0]).negate(); - Node s2r2 = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, s2, r).negate(); - - conc = NodeManager::currentNM()->mkNode(kind::OR, s1r1, s2r2); - conc = NodeManager::currentNM()->mkNode(kind::IMPLIES, g1, conc); - conc = NodeManager::currentNM()->mkNode(kind::FORALL, b1v, conc); - conc = NodeManager::currentNM()->mkNode(kind::AND, sne, conc); - } - break; - } - case kind::REGEXP_LOOP: { - Assert(r.getNumChildren() == 3); - if(r[1] == r[2]) { - if(r[1] == d_zero) { - conc = s.eqNode(d_emptyString).negate(); - } else if(r[1] == d_one) { - conc = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, s, r[0]).negate(); - } else { - //unroll for now - unsigned l = r[1].getConst().getNumerator().toUnsignedInt(); - std::vector vec; - for(unsigned i=0; imkNode(kind::REGEXP_CONCAT, vec); - conc = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, s, r2).negate(); - } - } else { - Assert(r[1] == d_zero); - //unroll for now - unsigned u = r[2].getConst().getNumerator().toUnsignedInt(); - std::vector vec; - vec.push_back(d_emptySingleton); - std::vector vec2; - for(unsigned i=1; i<=u; i++) { - vec2.push_back(r[0]); - Node r2 = i==1? r[0] : NodeManager::currentNM()->mkNode(kind::REGEXP_CONCAT, vec); - vec.push_back(r2); - } - Node r3 = NodeManager::currentNM()->mkNode(kind::REGEXP_UNION, vec); - conc = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, s, r3).negate(); - } - break; - } - case kind::REGEXP_COMPLEMENT: + else + { + // see if we can use an optimized version of the reduction for re.++. + Node r = t[1]; + if (r.getKind() == REGEXP_CONCAT) + { + // the index we are removing from the RE concatenation + size_t index = 0; + // As an optimization to the reduction, if we can determine that + // all strings in the language of R1 have the same length, say n, + // then the conclusion of the reduction is quantifier-free: + // ~( substr(s,0,n) in R1 ) OR ~( substr(s,n,len(s)-n) in R2) + Node reLen = getRegExpConcatFixed(r, index); + if (!reLen.isNull()) { - // ~( s in complement(R) ) ---> s in R - conc = nm->mkNode(STRING_IN_REGEXP, s, r[0]); - break; - } - default: { - Assert(!utils::isRegExpKind(k)); - break; + conc = reduceRegExpNegConcatFixed(tlit, reLen, index); } } - if (!conc.isNull()) + if (conc.isNull()) { - conc = Rewriter::rewrite(conc); - new_nodes.push_back(conc); - d_simpl_neg_cache[p] = conc; + conc = reduceRegExpNeg(tlit); } } + d_simpCache[tlit] = conc; + Trace("strings-regexp-simpl") + << "RegExpOpr::simplify: returns " << conc << std::endl; + return conc; } -void RegExpOpr::simplifyPRegExp( Node s, Node r, std::vector< Node > &new_nodes ) { - std::pair < Node, Node > p(s, r); - NodeManager *nm = NodeManager::currentNM(); - std::map < std::pair< Node, Node >, Node >::const_iterator itr = d_simpl_cache.find(p); - if(itr != d_simpl_cache.end()) { - new_nodes.push_back( itr->second ); - } else { - Kind k = r.getKind(); - Node conc; - switch( k ) { - case kind::REGEXP_EMPTY: { - conc = d_false; - break; - } - case kind::REGEXP_SIGMA: { - conc = d_one.eqNode(NodeManager::currentNM()->mkNode(kind::STRING_LENGTH, s)); - break; - } - case kind::REGEXP_RANGE: { - conc = s.eqNode( r[0] ); - if(r[0] != r[1]) { - unsigned a = r[0].getConst().front(); - unsigned b = r[1].getConst().front(); - a += 1; - std::vector anvec; - anvec.push_back(a); - Node an = nm->mkConst(String(anvec)); - Node tmp = a != b - ? nm->mkNode(kind::STRING_IN_REGEXP, - s, - nm->mkNode(kind::REGEXP_RANGE, an, r[1])) - : s.eqNode(r[1]); - conc = NodeManager::currentNM()->mkNode(kind::OR, conc, tmp); - } - break; - } - case kind::STRING_TO_REGEXP: { - conc = s.eqNode(r[0]); - break; - } - case kind::REGEXP_CONCAT: { - std::vector< Node > nvec; - std::vector< Node > cc; - bool emptyflag = false; - for(unsigned i=0; imkSkolem( "rc", s.getType(), "created for regular expression concat" ); - Node lem = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, sk, r[i]); - nvec.push_back(lem); - cc.push_back(sk); - } - } - if(emptyflag) { - conc = d_false; - } else { - Node lem = s.eqNode( NodeManager::currentNM()->mkNode(kind::STRING_CONCAT, cc) ); - nvec.push_back(lem); - conc = nvec.size() == 1 ? nvec[0] : NodeManager::currentNM()->mkNode(kind::AND, nvec); - } - break; - } - case kind::REGEXP_UNION: { - std::vector< Node > c_or; - for(unsigned i=0; imkNode(kind::STRING_IN_REGEXP, s, r[i])); - } - } - conc = c_or.size() == 0 ? d_false : - c_or.size() == 1 ? c_or[0] : NodeManager::currentNM()->mkNode(kind::OR, c_or); - break; - } - case kind::REGEXP_INTER: { - std::vector< Node > c_and; - bool emptyflag = false; - for(unsigned i=0; imkNode(kind::STRING_IN_REGEXP, s, r[i])); - } - } - if(emptyflag) { - conc = d_false; - } else { - conc = c_and.size() == 1 ? c_and[0] : NodeManager::currentNM()->mkNode(kind::AND, c_and); - } - break; - } - case kind::REGEXP_STAR: { - if(s == d_emptyString) { - conc = d_true; - } else if(r[0].getKind() == kind::REGEXP_EMPTY) { - conc = s.eqNode(d_emptyString); - } else if(r[0].getKind() == kind::REGEXP_SIGMA) { - conc = d_true; - } else { - Node se = s.eqNode(d_emptyString); - Node sinr = nm->mkNode(kind::STRING_IN_REGEXP, s, r[0]); - Node sk1 = nm->mkSkolem( - "rs", s.getType(), "created for regular expression star"); - Node sk2 = nm->mkSkolem( - "rs", s.getType(), "created for regular expression star"); - Node sk3 = nm->mkSkolem( - "rs", s.getType(), "created for regular expression star"); - NodeBuilder<> nb(kind::AND); - nb << sk1.eqNode(d_emptyString).negate(); - nb << sk3.eqNode(d_emptyString).negate(); - nb << nm->mkNode(kind::STRING_IN_REGEXP, sk1, r[0]); - nb << nm->mkNode(kind::STRING_IN_REGEXP, sk2, r); - nb << nm->mkNode(kind::STRING_IN_REGEXP, sk3, r[0]); - nb << s.eqNode(nm->mkNode(kind::STRING_CONCAT, sk1, sk2, sk3)); - conc = nb; +Node RegExpOpr::getRegExpConcatFixed(Node r, size_t& index) +{ + Assert(r.getKind() == REGEXP_CONCAT); + index = 0; + Node reLen = RegExpEntail::getFixedLengthForRegexp(r[0]); + if (!reLen.isNull()) + { + return reLen; + } + // try from the opposite end + size_t indexE = r.getNumChildren() - 1; + reLen = RegExpEntail::getFixedLengthForRegexp(r[indexE]); + if (!reLen.isNull()) + { + index = indexE; + return reLen; + } + return Node::null(); +} - // We unfold `x in R*` by considering three cases: `x` is empty, `x` - // is matched by `R`, or `x` is matched by two or more `R`s. For the - // last case, we break `x` into three pieces, making the beginning - // and the end each match `R` and the middle match `R*`. Matching the - // beginning and the end with `R` allows us to reason about the - // beginning and the end of `x` simultaneously. - // - // x in R* ---> (x = "") v (x in R) v - // (x = x1 ++ x2 ++ x3 ^ x1 != "" ^ x3 != "" ^ - // x1 in R ^ x2 in R* ^ x3 in R) - conc = nm->mkNode(kind::OR, se, sinr, conc); - } - break; - } - case kind::REGEXP_LOOP: { - Assert(r.getNumChildren() == 3); - if(r[1] == d_zero) { - if(r[2] == d_zero) { - conc = s.eqNode( d_emptyString ); - } else { - //R{0,n} - if(s != d_emptyString) { - Node sk1 = NodeManager::currentNM()->mkSkolem( "lps", s.getType(), "created for regular expression loop" ); - Node sk2 = NodeManager::currentNM()->mkSkolem( "lps", s.getType(), "created for regular expression loop" ); - Node seq12 = s.eqNode(NodeManager::currentNM()->mkNode(kind::STRING_CONCAT, sk1, sk2)); - Node sk1ne = sk1.eqNode(d_emptyString).negate(); - Node sk1inr = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, sk1, r[0]); - unsigned u = r[2].getConst().getNumerator().toUnsignedInt(); - Node u1 = NodeManager::currentNM()->mkConst(CVC4::Rational(u - 1)); - Node sk2inru = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, sk2, - NodeManager::currentNM()->mkNode(kind::REGEXP_LOOP, r[0], d_zero, u1)); - conc = NodeManager::currentNM()->mkNode(kind::AND, seq12, sk1ne, sk1inr, sk2inru); - conc = NodeManager::currentNM()->mkNode(kind::OR, - s.eqNode(d_emptyString), conc); - } else { - conc = d_true; - } - } - } else { - //R^n - Node sk1 = NodeManager::currentNM()->mkSkolem( "lps", s.getType(), "created for regular expression loop" ); - Node sk2 = NodeManager::currentNM()->mkSkolem( "lps", s.getType(), "created for regular expression loop" ); - Node seq12 = s.eqNode(NodeManager::currentNM()->mkNode(kind::STRING_CONCAT, sk1, sk2)); - Node sk1ne = sk1.eqNode(d_emptyString).negate(); - Node sk1inr = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, sk1, r[0]); - unsigned u = r[2].getConst().getNumerator().toUnsignedInt(); - Node u1 = NodeManager::currentNM()->mkConst(CVC4::Rational(u - 1)); - Node sk2inru = NodeManager::currentNM()->mkNode(kind::STRING_IN_REGEXP, sk2, - NodeManager::currentNM()->mkNode(kind::REGEXP_LOOP, r[0], u1, u1)); - conc = NodeManager::currentNM()->mkNode(kind::AND, seq12, sk1ne, sk1inr, sk2inru); - } - break; - } - case kind::REGEXP_COMPLEMENT: +Node RegExpOpr::reduceRegExpNeg(Node mem) +{ + Assert(mem.getKind() == NOT && mem[0].getKind() == STRING_IN_REGEXP); + Node s = mem[0][0]; + Node r = mem[0][1]; + NodeManager* nm = NodeManager::currentNM(); + Kind k = r.getKind(); + Node zero = nm->mkConst(Rational(0)); + Node conc; + if (k == REGEXP_CONCAT) + { + // do not use length entailment, call regular expression concat + Node reLen; + size_t i = 0; + conc = reduceRegExpNegConcatFixed(mem, reLen, i); + } + else if (k == REGEXP_STAR) + { + Node emp = Word::mkEmptyWord(s.getType()); + Node lens = nm->mkNode(STRING_LENGTH, s); + Node sne = s.eqNode(emp).negate(); + Node b1 = nm->mkBoundVar(nm->integerType()); + Node b1v = nm->mkNode(BOUND_VAR_LIST, b1); + Node g1 = + nm->mkNode(AND, nm->mkNode(GT, b1, zero), nm->mkNode(GEQ, lens, b1)); + // internal + Node s1 = nm->mkNode(STRING_SUBSTR, s, zero, b1); + Node s2 = nm->mkNode(STRING_SUBSTR, s, b1, nm->mkNode(MINUS, lens, b1)); + Node s1r1 = nm->mkNode(STRING_IN_REGEXP, s1, r[0]).negate(); + Node s2r2 = nm->mkNode(STRING_IN_REGEXP, s2, r).negate(); + + conc = nm->mkNode(OR, s1r1, s2r2); + conc = nm->mkNode(IMPLIES, g1, conc); + conc = nm->mkNode(FORALL, b1v, conc); + conc = nm->mkNode(AND, sne, conc); + } + else + { + Assert(!utils::isRegExpKind(k)); + } + return conc; +} + +Node RegExpOpr::reduceRegExpNegConcatFixed(Node mem, Node reLen, size_t index) +{ + Assert(mem.getKind() == NOT && mem[0].getKind() == STRING_IN_REGEXP); + Node s = mem[0][0]; + Node r = mem[0][1]; + NodeManager* nm = NodeManager::currentNM(); + Assert(r.getKind() == REGEXP_CONCAT); + Node zero = nm->mkConst(Rational(0)); + // The following simplification states that + // ~( s in R1 ++ R2 ++... ++ Rn ) + // is equivalent to + // forall x. + // 0 <= x <= len(s) => + // ~(substr(s,0,x) in R1) OR ~(substr(s,x,len(s)-x) in R2 ++ ... ++ Rn) + // Index is the child index of r that we are stripping off, which is either + // from the beginning or the end. + Assert(index == 0 || index == r.getNumChildren() - 1); + Node lens = nm->mkNode(STRING_LENGTH, s); + Node b1; + Node b1v; + Node guard; + if (reLen.isNull()) + { + b1 = SkolemCache::mkIndexVar(mem); + b1v = nm->mkNode(BOUND_VAR_LIST, b1); + guard = nm->mkNode(AND, + nm->mkNode(GEQ, b1, zero), + nm->mkNode(GEQ, nm->mkNode(STRING_LENGTH, s), b1)); + } + else + { + b1 = reLen; + } + Node s1; + Node s2; + if (index == 0) + { + s1 = nm->mkNode(STRING_SUBSTR, s, zero, b1); + s2 = nm->mkNode(STRING_SUBSTR, s, b1, nm->mkNode(MINUS, lens, b1)); + } + else + { + s1 = nm->mkNode(STRING_SUBSTR, s, nm->mkNode(MINUS, lens, b1), b1); + s2 = nm->mkNode(STRING_SUBSTR, s, zero, nm->mkNode(MINUS, lens, b1)); + } + Node s1r1 = nm->mkNode(STRING_IN_REGEXP, s1, r[index]).negate(); + std::vector nvec; + for (unsigned i = 0, nchild = r.getNumChildren(); i < nchild; i++) + { + if (i != index) + { + nvec.push_back(r[i]); + } + } + Node r2 = nvec.size() == 1 ? nvec[0] : nm->mkNode(REGEXP_CONCAT, nvec); + r2 = Rewriter::rewrite(r2); + Node s2r2 = nm->mkNode(STRING_IN_REGEXP, s2, r2).negate(); + Node conc = nm->mkNode(OR, s1r1, s2r2); + if (!b1v.isNull()) + { + conc = nm->mkNode(OR, guard.negate(), conc); + conc = nm->mkNode(FORALL, b1v, conc); + } + return conc; +} + +Node RegExpOpr::reduceRegExpPos(Node mem, SkolemCache* sc) +{ + Assert(mem.getKind() == STRING_IN_REGEXP); + Node s = mem[0]; + Node r = mem[1]; + NodeManager* nm = NodeManager::currentNM(); + Kind k = r.getKind(); + Node conc; + if (k == REGEXP_CONCAT) + { + std::vector nvec; + std::vector cc; + // get the (valid) existential for this membership + Node eform = getExistsForRegExpConcatMem(mem); + SkolemManager* sm = nm->getSkolemManager(); + // Notice that this rule does not introduce witness terms, instead it + // uses skolems in the conclusion of the proof rule directly. Thus, + // the existential eform does not need to be explicitly justified by a + // proof here, since it is only being used as an intermediate formula in + // this inference. Hence we do not pass a proof generator to mkSkolemize. + std::vector skolems; + sm->mkSkolemize(eform, skolems, "rc", "regexp concat skolem"); + Assert(skolems.size() == r.getNumChildren()); + // Look up skolems for each of the components. If sc has optimizations + // enabled, this will return arguments of str.to_re. + for (unsigned i = 0, nchild = r.getNumChildren(); i < nchild; ++i) + { + if (r[i].getKind() == STRING_TO_REGEXP) { - // s in complement(R) ---> ~( s in R ) - conc = nm->mkNode(STRING_IN_REGEXP, s, r[0]).negate(); - break; + // optimization, just take the body + skolems[i] = r[i][0]; } - default: { - Assert(!utils::isRegExpKind(k)); - break; + else + { + nvec.push_back(nm->mkNode(STRING_IN_REGEXP, skolems[i], r[i])); } } - if (!conc.isNull()) - { - conc = Rewriter::rewrite(conc); - new_nodes.push_back(conc); - d_simpl_cache[p] = conc; - } + // (str.in_re x (re.++ R1 .... Rn)) => + // (and (str.in_re k1 R1) ... (str.in_re kn Rn) (= x (str.++ k1 ... kn))) + Node lem = s.eqNode(nm->mkNode(STRING_CONCAT, skolems)); + nvec.push_back(lem); + conc = nvec.size() == 1 ? nvec[0] : nm->mkNode(AND, nvec); + } + else if (k == REGEXP_STAR) + { + Node emp = Word::mkEmptyWord(s.getType()); + Node se = s.eqNode(emp); + Node sinr = nm->mkNode(STRING_IN_REGEXP, s, r[0]); + Node reExpand = nm->mkNode(REGEXP_CONCAT, r[0], r, r[0]); + Node sinRExp = nm->mkNode(STRING_IN_REGEXP, s, reExpand); + // We unfold `x in R*` by considering three cases: `x` is empty, `x` + // is matched by `R`, or `x` is matched by two or more `R`s. For the + // last case, `x` will break into three pieces, making the beginning + // and the end each match `R` and the middle match `R*`. Matching the + // beginning and the end with `R` allows us to reason about the + // beginning and the end of `x` simultaneously. + // + // x in R* ---> (x = "") v (x in R) v (x in (re.++ R (re.* R) R)) + + // We also immediately unfold the last disjunct for re.*. The advantage + // of doing this is that we use the same scheme for skolems above. + sinRExp = reduceRegExpPos(sinRExp, sc); + // make the return lemma + conc = nm->mkNode(OR, se, sinr, sinRExp); } + else + { + Assert(!utils::isRegExpKind(k)); + } + return conc; } bool RegExpOpr::isPairNodesInSet(std::set< PairNodes > &s, Node n1, Node n2) { @@ -1384,24 +1192,6 @@ void RegExpOpr::convert2(unsigned cnt, Node n, Node &r1, Node &r2) { } } -bool RegExpOpr::testNoRV(Node r) { - std::map< Node, bool >::const_iterator itr = d_norv_cache.find(r); - if(itr != d_norv_cache.end()) { - return itr->second; - } else { - if(r.getKind() == kind::REGEXP_RV) { - return false; - } else if(r.getNumChildren() > 1) { - for(unsigned int i=0; i cache, unsigned cnt ) { //Assert(checkConstRegExp(r1) && checkConstRegExp(r2)); if(r1 > r2) { @@ -1410,13 +1200,6 @@ Node RegExpOpr::intersectInternal( Node r1, Node r2, std::map< PairNodes, Node > r2 = tmpNode; } Trace("regexp-int") << "Starting INTERSECT(" << cnt << "):\n "<< mkString(r1) << ",\n " << mkString(r2) << std::endl; - //if(Trace.isOn("regexp-debug")) { - // Trace("regexp-debug") << "... with cache:\n"; - // for(std::map< PairNodes, Node >::const_iterator itr=cache.begin(); - // itr!=cache.end();itr++) { - // Trace("regexp-debug") << "(" << itr->first.first << "," << itr->first.second << ")->" << itr->second << std::endl; - // } - //} std::pair < Node, Node > p(r1, r2); std::map < PairNodes, Node >::const_iterator itr = d_inter_cache.find(p); Node rNode; @@ -1528,7 +1311,8 @@ Node RegExpOpr::intersectInternal( Node r1, Node r2, std::map< PairNodes, Node > } } Trace("regexp-int-debug") << " ... try testing no RV of " << mkString(rNode) << std::endl; - if(testNoRV(rNode)) { + if (!expr::hasSubtermKind(REGEXP_RV, rNode)) + { d_inter_cache[p] = rNode; } } @@ -1538,80 +1322,103 @@ Node RegExpOpr::intersectInternal( Node r1, Node r2, std::map< PairNodes, Node > Node RegExpOpr::removeIntersection(Node r) { Assert(checkConstRegExp(r)); - std::map < Node, Node >::const_iterator itr = d_rm_inter_cache.find(r); - if(itr != d_rm_inter_cache.end()) { - return itr->second; - } - Node retNode; - Kind rk = r.getKind(); - switch (rk) + NodeManager* nm = NodeManager::currentNM(); + std::unordered_map visited; + std::unordered_map::iterator it; + std::vector visit; + TNode cur; + visit.push_back(r); + do { - case REGEXP_EMPTY: - case REGEXP_SIGMA: - case REGEXP_RANGE: - case STRING_TO_REGEXP: - { - retNode = r; - break; - } - case REGEXP_CONCAT: - case REGEXP_UNION: - case REGEXP_STAR: - case REGEXP_COMPLEMENT: + cur = visit.back(); + visit.pop_back(); + it = visited.find(cur); + + if (it == visited.end()) { - NodeBuilder<> nb(rk); - for (const Node& rc : r) + visited[cur] = Node::null(); + visit.push_back(cur); + for (const Node& cn : cur) { - nb << removeIntersection(rc); + visit.push_back(cn); } - retNode = Rewriter::rewrite(nb.constructNode()); - break; } - - case REGEXP_INTER: + else if (it->second.isNull()) { - retNode = removeIntersection(r[0]); - for (size_t i = 1, nchild = r.getNumChildren(); i < nchild; i++) + Kind ck = cur.getKind(); + Node ret; + bool childChanged = false; + std::vector children; + if (cur.getMetaKind() == metakind::PARAMETERIZED) { - bool spflag = false; - Node tmpNode = removeIntersection(r[i]); - retNode = intersect(retNode, tmpNode, spflag); + children.push_back(cur.getOperator()); } - break; - } - case REGEXP_LOOP: - { - retNode = removeIntersection(r[0]); - retNode = Rewriter::rewrite( - NodeManager::currentNM()->mkNode(REGEXP_LOOP, retNode, r[1], r[2])); - break; - } - default: - { - Unreachable(); + for (const Node& cn : cur) + { + it = visited.find(cn); + Assert(it != visited.end()); + Assert(!it->second.isNull()); + if (ck == REGEXP_INTER) + { + if (ret.isNull()) + { + ret = it->second; + } + else + { + ret = intersect(ret, it->second); + } + } + else + { + // will construct below + childChanged = childChanged || cn != it->second; + children.push_back(it->second); + } + } + if (ck != REGEXP_INTER) + { + if (childChanged) + { + ret = nm->mkNode(cur.getKind(), children); + } + else + { + ret = cur; + } + } + visited[cur] = ret; } + } while (!visit.empty()); + Assert(visited.find(r) != visited.end()); + Assert(!visited.find(r)->second.isNull()); + if (Trace.isOn("regexp-intersect")) + { + Trace("regexp-intersect") << "Remove INTERSECTION( " << mkString(r) + << " ) = " << mkString(visited[r]) << std::endl; } - d_rm_inter_cache[r] = retNode; - Trace("regexp-intersect") << "Remove INTERSECTION( " << mkString(r) << " ) = " << mkString(retNode) << std::endl; - return retNode; + return visited[r]; } -Node RegExpOpr::intersect(Node r1, Node r2, bool &spflag) { - if(checkConstRegExp(r1) && checkConstRegExp(r2)) { - Node rr1 = removeIntersection(r1); - Node rr2 = removeIntersection(r2); - std::map< PairNodes, Node > cache; - Trace("regexp-intersect-node") << "Intersect (1): " << rr1 << std::endl; - Trace("regexp-intersect-node") << "Intersect (2): " << rr2 << std::endl; - Trace("regexp-intersect") << "Start INTERSECTION(\n\t" << mkString(r1) << ",\n\t"<< mkString(r2) << ")" << std::endl; - Node retNode = intersectInternal(rr1, rr2, cache, 1); - Trace("regexp-intersect") << "End INTERSECTION(\n\t" << mkString(r1) << ",\n\t"<< mkString(r2) << ") =\n\t" << mkString(retNode) << std::endl; - Trace("regexp-intersect-node") << "Intersect finished." << std::endl; - return retNode; - } else { - spflag = true; +Node RegExpOpr::intersect(Node r1, Node r2) +{ + if (!checkConstRegExp(r1) || !checkConstRegExp(r2)) + { return Node::null(); } + Node rr1 = removeIntersection(r1); + Node rr2 = removeIntersection(r2); + std::map cache; + Trace("regexp-intersect-node") << "Intersect (1): " << rr1 << std::endl; + Trace("regexp-intersect-node") << "Intersect (2): " << rr2 << std::endl; + Trace("regexp-intersect") << "Start INTERSECTION(\n\t" << mkString(r1) + << ",\n\t" << mkString(r2) << ")" << std::endl; + Node retNode = intersectInternal(rr1, rr2, cache, 1); + Trace("regexp-intersect") + << "End INTERSECTION(\n\t" << mkString(r1) << ",\n\t" << mkString(r2) + << ") =\n\t" << mkString(retNode) << std::endl; + Trace("regexp-intersect-node") << "Intersect finished." << std::endl; + return retNode; } //printing @@ -1695,16 +1502,15 @@ std::string RegExpOpr::mkString( Node r ) { break; } case kind::REGEXP_LOOP: { - retStr += "("; - retStr += mkString(r[0]); - retStr += ")"; - retStr += "{"; - retStr += r[1].getConst().toString(); - retStr += ","; + uint32_t l = utils::getLoopMinOccurrences(r); + std::stringstream ss; + ss << "(" << mkString(r[0]) << "){" << l << ","; if(r.getNumChildren() == 3) { - retStr += r[2].getConst().toString(); + uint32_t u = utils::getLoopMaxOccurrences(r); + ss << u; } - retStr += "}"; + ss << "}"; + retStr += ss.str(); break; } case kind::REGEXP_RV: { @@ -1746,6 +1552,50 @@ bool RegExpOpr::regExpIncludes(Node r1, Node r2) return result; } +/** + * Associating formulas with their "exists form", or an existentially + * quantified formula that is equivalent to it. This is currently used + * for regular expression memberships in the method below. + */ +struct ExistsFormAttributeId +{ +}; +typedef expr::Attribute ExistsFormAttribute; + +Node RegExpOpr::getExistsForRegExpConcatMem(Node mem) +{ + // get or make the exists form of the membership + ExistsFormAttribute efa; + if (mem.hasAttribute(efa)) + { + // already computed + return mem.getAttribute(efa); + } + Assert(mem.getKind() == STRING_IN_REGEXP); + Node x = mem[0]; + Node r = mem[1]; + Assert(r.getKind() == REGEXP_CONCAT); + NodeManager* nm = NodeManager::currentNM(); + TypeNode xtn = x.getType(); + std::vector vars; + std::vector mems; + for (const Node& rc : r) + { + Node v = nm->mkBoundVar(xtn); + vars.push_back(v); + mems.push_back(nm->mkNode(STRING_IN_REGEXP, v, rc)); + } + Node sconcat = nm->mkNode(STRING_CONCAT, vars); + Node eq = x.eqNode(sconcat); + mems.insert(mems.begin(), eq); + Node bvl = nm->mkNode(BOUND_VAR_LIST, vars); + Node ebody = nm->mkNode(AND, mems); + Node eform = nm->mkNode(EXISTS, bvl, ebody); + mem.setAttribute(efa, eform); + Trace("regexp-opr") << "Exists form " << mem << " : " << eform << std::endl; + return eform; +} + }/* CVC4::theory::strings namespace */ }/* CVC4::theory namespace */ }/* CVC4 namespace */ diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h index d0b0755eb..66be4036b 100644 --- a/src/theory/strings/regexp_operation.h +++ b/src/theory/strings/regexp_operation.h @@ -19,14 +19,14 @@ #ifndef CVC4__THEORY__STRINGS__REGEXP__OPERATION_H #define CVC4__THEORY__STRINGS__REGEXP__OPERATION_H -#include +#include #include -#include -#include -#include "util/hash.h" +#include +#include + +#include "expr/node.h" +#include "theory/strings/skolem_cache.h" #include "util/string.h" -#include "theory/theory.h" -#include "theory/rewriter.h" namespace CVC4 { namespace theory { @@ -73,23 +73,17 @@ class RegExpOpr { Node d_sigma; Node d_sigma_star; - std::map d_simpl_cache; - std::map d_simpl_neg_cache; + /** A cache for simplify */ + std::map d_simpCache; std::map > d_delta_cache; std::map d_dv_cache; std::map > d_deriv_cache; - std::map > d_compl_cache; /** cache mapping regular expressions to whether they contain constants */ std::unordered_map d_constCache; - std::map, std::set > > d_cset_cache; std::map, std::set > > d_fset_cache; std::map d_inter_cache; - std::map d_rm_inter_cache; - std::map d_norv_cache; std::map > d_split_cache; std::map d_inclusionCache; - void simplifyPRegExp(Node s, Node r, std::vector &new_nodes); - void simplifyNRegExp(Node s, Node r, std::vector &new_nodes); /** * Helper function for mkString, pretty prints constant or variable regular * expression r. @@ -101,16 +95,19 @@ class RegExpOpr { bool containC2(unsigned cnt, Node n); Node convert1(unsigned cnt, Node n); void convert2(unsigned cnt, Node n, Node &r1, Node &r2); - bool testNoRV(Node r); Node intersectInternal(Node r1, Node r2, std::map cache, unsigned cnt); + /** + * Given a regular expression r, this returns an equivalent regular expression + * that contains no applications of intersection. + */ Node removeIntersection(Node r); void firstChars(Node r, std::set &pcset, SetNodes &pvset); public: - RegExpOpr(); + RegExpOpr(SkolemCache* sc); ~RegExpOpr(); /** @@ -121,7 +118,42 @@ class RegExpOpr { bool checkConstRegExp( Node r ); /** get the constant type for regular expression r */ RegExpConstType getRegExpConstType(Node r); - void simplify(Node t, std::vector< Node > &new_nodes, bool polarity); + /** Simplify + * + * This is the main method to simplify (unfold) a regular expression + * membership. It is called where t is of the form (str.in_re s r), + * and t (or (not t), when polarity=false) holds in the current context. + * It returns the unfolded form of t. + */ + Node simplify(Node t, bool polarity); + /** + * Given regular expression of the form + * (re.++ r_0 ... r_{n-1}) + * This returns a non-null node reLen and updates index such that + * RegExpEntail::getFixedLengthForRegexp(r_index) = reLen + * where index is set to either 0 or n-1. + */ + static Node getRegExpConcatFixed(Node r, size_t& index); + //------------------------ trusted reductions + /** + * Return the unfolded form of mem of the form (str.in_re s r). + */ + static Node reduceRegExpPos(Node mem, SkolemCache* sc); + /** + * Return the unfolded form of mem of the form (not (str.in_re s r)). + */ + static Node reduceRegExpNeg(Node mem); + /** + * Return the unfolded form of mem of the form + * (not (str.in_re s (re.++ r_0 ... r_{n-1}))) + * Called when RegExpEntail::getFixedLengthForRegexp(r_index) = reLen + * where index is either 0 or n-1. + * + * This uses reLen as an optimization to improve the reduction. If reLen + * is null, then this optimization is not applied. + */ + static Node reduceRegExpNegConcatFixed(Node mem, Node reLen, size_t index); + //------------------------ end trusted reductions /** * This method returns 1 if the empty string is in r, 2 if the empty string * is not in r, or 0 if it is unknown whether the empty string is in r. @@ -141,9 +173,9 @@ class RegExpOpr { Node derivativeSingle( Node r, CVC4::String c ); /** * Returns the regular expression intersection of r1 and r2. If r1 or r2 is - * not constant, then this method returns null and sets spflag to true. + * not constant, then this method returns null. */ - Node intersect(Node r1, Node r2, bool &spflag); + Node intersect(Node r1, Node r2); /** Get the pretty printed version of the regular expression r */ static std::string mkString(Node r); @@ -155,6 +187,22 @@ class RegExpOpr { * for performance reasons. */ bool regExpIncludes(Node r1, Node r2); + + private: + /** + * Given a regular expression membership of the form: + * (str.in_re x (re.++ R1 ... Rn)) + * This returns the valid existentially quantified formula: + * (exists ((x1 String) ... (xn String)) + * (=> (str.in_re x (re.++ R1 ... Rn)) + * (and (= x (str.++ x1 ... xn)) + * (str.in_re x1 R1) ... (str.in_re xn Rn)))) + * Moreover, this formula is cached per regular expression membership via + * an attribute, meaning it is always the same for a given membership mem. + */ + static Node getExistsForRegExpConcatMem(Node mem); + /** pointer to the skolem cache used by this class */ + SkolemCache* d_sc; }; }/* CVC4::theory::strings namespace */ diff --git a/src/theory/strings/regexp_solver.cpp b/src/theory/strings/regexp_solver.cpp index 53c6c9acc..c5d6df601 100644 --- a/src/theory/strings/regexp_solver.cpp +++ b/src/theory/strings/regexp_solver.cpp @@ -33,19 +33,19 @@ namespace strings { RegExpSolver::RegExpSolver(SolverState& s, InferenceManager& im, + SkolemCache* skc, CoreSolver& cs, ExtfSolver& es, - SequencesStatistics& stats, - context::Context* c, - context::UserContext* u) + SequencesStatistics& stats) : d_state(s), d_im(im), d_csolver(cs), d_esolver(es), d_statistics(stats), - d_regexp_ucached(u), - d_regexp_ccached(c), - d_processed_memberships(c) + d_regexp_ucached(s.getUserContext()), + d_regexp_ccached(s.getSatContext()), + d_processed_memberships(s.getSatContext()), + d_regexp_opr(skc) { d_emptyString = NodeManager::currentNM()->mkConst(::CVC4::String("")); std::vector nvec; @@ -260,16 +260,14 @@ void RegExpSolver::check(const std::map >& mems) << "Unroll/simplify membership of atomic term " << rep << std::endl; // if so, do simple unrolling - std::vector nvec; Trace("strings-regexp") << "Simplify on " << atom << std::endl; - d_regexp_opr.simplify(atom, nvec, polarity); + Node conc = d_regexp_opr.simplify(atom, polarity); Trace("strings-regexp") << "...finished" << std::endl; // if simplifying successfully generated a lemma - if (!nvec.empty()) + if (!conc.isNull()) { std::vector exp_n; exp_n.push_back(assertion); - Node conc = nvec.size() == 1 ? nvec[0] : nm->mkNode(AND, nvec); Assert(atom.getKind() == STRING_IN_REGEXP); if (polarity) { @@ -468,11 +466,9 @@ bool RegExpSolver::checkEqcIntersect(const std::vector& mems) rcti = rct; continue; } - bool spflag = false; - Node resR = d_regexp_opr.intersect(mi[1], m[1], spflag); + Node resR = d_regexp_opr.intersect(mi[1], m[1]); // intersection should be computable Assert(!resR.isNull()); - Assert(!spflag); if (resR == d_emptyRegexp) { // conflict, explain diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h index 9e9ba5845..59afd5ba3 100644 --- a/src/theory/strings/regexp_solver.h +++ b/src/theory/strings/regexp_solver.h @@ -25,6 +25,7 @@ #include "expr/node.h" #include "theory/strings/extf_solver.h" #include "theory/strings/inference_manager.h" +#include "theory/strings/skolem_cache.h" #include "theory/strings/regexp_operation.h" #include "theory/strings/sequences_stats.h" #include "theory/strings/solver_state.h" @@ -46,11 +47,10 @@ class RegExpSolver public: RegExpSolver(SolverState& s, InferenceManager& im, + SkolemCache* skc, CoreSolver& cs, ExtfSolver& es, - SequencesStatistics& stats, - context::Context* c, - context::UserContext* u); + SequencesStatistics& stats); ~RegExpSolver() {} /** check regular expression memberships diff --git a/src/theory/strings/theory_strings.cpp b/src/theory/strings/theory_strings.cpp index 102826591..2c7573949 100644 --- a/src/theory/strings/theory_strings.cpp +++ b/src/theory/strings/theory_strings.cpp @@ -61,7 +61,8 @@ TheoryStrings::TheoryStrings(context::Context* c, d_csolver, d_extTheory, d_statistics), - d_rsolver(d_state, d_im, d_csolver, d_esolver, d_statistics, c, u), + d_rsolver(d_state, d_im, d_termReg.getSkolemCache(), + d_csolver, d_esolver, d_statistics), d_stringsFmf(c, u, valuation, d_termReg) { bool eagerEval = options::stringEagerEval(); diff --git a/test/unit/theory/regexp_operation_black.h b/test/unit/theory/regexp_operation_black.h index 6e01392ab..551d59bef 100644 --- a/test/unit/theory/regexp_operation_black.h +++ b/test/unit/theory/regexp_operation_black.h @@ -25,7 +25,9 @@ #include "expr/node_manager.h" #include "smt/smt_engine.h" #include "smt/smt_engine_scope.h" +#include "theory/rewriter.h" #include "theory/strings/regexp_operation.h" +#include "theory/strings/skolem_cache.h" using namespace CVC4; using namespace CVC4::kind; @@ -46,7 +48,8 @@ class RegexpOperationBlack : public CxxTest::TestSuite d_em = d_slv->getExprManager(); d_smt = d_slv->getSmtEngine(); d_scope = new SmtScope(d_smt); - d_regExpOpr = new RegExpOpr(); + d_skc = new SkolemCache(); + d_regExpOpr = new RegExpOpr(d_skc); // Ensure that the SMT engine is fully initialized (required for the // rewriter) @@ -58,6 +61,7 @@ class RegexpOperationBlack : public CxxTest::TestSuite void tearDown() override { delete d_regExpOpr; + delete d_skc; delete d_scope; delete d_slv; } @@ -153,6 +157,7 @@ class RegexpOperationBlack : public CxxTest::TestSuite ExprManager* d_em; SmtEngine* d_smt; SmtScope* d_scope; + SkolemCache* d_skc; RegExpOpr* d_regExpOpr; NodeManager* d_nm; -- 2.30.2