From c9f50eed48fa920320f93921415885f0042b9d39 Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Mon, 23 Jul 2018 18:02:36 -0500 Subject: [PATCH] Improve rewriter for regular expression concatenation (#2196) --- .../strings/theory_strings_rewriter.cpp | 218 ++++++++++-------- 1 file changed, 128 insertions(+), 90 deletions(-) diff --git a/src/theory/strings/theory_strings_rewriter.cpp b/src/theory/strings/theory_strings_rewriter.cpp index cd7c6eeb4..8c589640c 100644 --- a/src/theory/strings/theory_strings_rewriter.cpp +++ b/src/theory/strings/theory_strings_rewriter.cpp @@ -387,77 +387,115 @@ Node TheoryStringsRewriter::rewriteConcat(Node node) Node TheoryStringsRewriter::prerewriteConcatRegExp( TNode node ) { Assert( node.getKind() == kind::REGEXP_CONCAT ); - Trace("strings-prerewrite") << "Strings::prerewriteConcatRegExp start " << node << std::endl; + NodeManager* nm = NodeManager::currentNM(); + Trace("strings-prerewrite") + << "Strings::prerewriteConcatRegExp flatten " << node << std::endl; Node retNode = node; - std::vector node_vec; - Node preNode = Node::null(); - bool emptyflag = false; - for(unsigned int i=0; imkNode( - kind::STRING_CONCAT, preNode, tmpNode[0][0])); - node_vec.push_back( NodeManager::currentNM()->mkNode( kind::STRING_TO_REGEXP, preNode ) ); - preNode = Node::null(); - } else { - node_vec.push_back( NodeManager::currentNM()->mkNode( kind::STRING_TO_REGEXP, preNode ) ); - preNode = Node::null(); - node_vec.push_back( tmpNode[0] ); - } - ++j; - } - for(; j vec; + bool changed = false; + Node emptyRe; + for (const Node& c : node) + { + if (c.getKind() == REGEXP_CONCAT) + { + changed = true; + for (const Node& cc : c) + { + vec.push_back(cc); } } - if( tmpNode.getKind() == kind::STRING_TO_REGEXP ) { - if(preNode.isNull()) { - preNode = tmpNode[0]; - } else { - preNode = rewriteConcat(NodeManager::currentNM()->mkNode( - kind::STRING_CONCAT, preNode, tmpNode[0])); - } - } else if( tmpNode.getKind() == kind::REGEXP_EMPTY ) { - emptyflag = true; - break; - } else { - if(!preNode.isNull()) { - if(preNode.getKind() == kind::CONST_STRING && preNode.getConst().isEmptyString() ) { - preNode = Node::null(); - } else { - node_vec.push_back( NodeManager::currentNM()->mkNode( kind::STRING_TO_REGEXP, preNode ) ); - preNode = Node::null(); + else if (c.getKind() == STRING_TO_REGEXP && c[0].isConst() + && c[0].getConst().isEmptyString()) + { + changed = true; + emptyRe = c; + } + else if (c.getKind() == REGEXP_EMPTY) + { + // re.++( ..., empty, ... ) ---> empty + std::vector nvec; + return nm->mkNode(REGEXP_EMPTY, nvec); + } + else + { + vec.push_back(c); + } + } + if (changed) + { + // flatten + // this handles nested re.++ and elimination or str.to.re(""), e.g.: + // re.++( re.++( R1, R2 ), str.to.re(""), R3 ) ---> re.++( R1, R2, R3 ) + if (vec.empty()) + { + Assert(!emptyRe.isNull()); + retNode = emptyRe; + } + else + { + retNode = vec.size() == 1 ? vec[0] : nm->mkNode(REGEXP_CONCAT, vec); + } + return returnRewrite(node, retNode, "re.concat-flatten"); + } + Trace("strings-prerewrite") + << "Strings::prerewriteConcatRegExp start " << node << std::endl; + std::vector cvec; + std::vector preReStr; + for (unsigned i = 0, size = vec.size(); i <= size; i++) + { + Node curr; + if (i < size) + { + curr = vec[i]; + Assert(curr.getKind() != REGEXP_CONCAT); + if (!cvec.empty() && preReStr.empty()) + { + Node cvecLast = cvec.back(); + if (cvecLast.getKind() == REGEXP_STAR && cvecLast[0] == curr) + { + // by convention, flip the order (a*)++a ---> a++(a*) + cvec[cvec.size() - 1] = curr; + cvec.push_back(cvecLast); + curr = Node::null(); } } - node_vec.push_back( tmpNode ); } - } - if(emptyflag) { - std::vector< Node > nvec; - retNode = NodeManager::currentNM()->mkNode( kind::REGEXP_EMPTY, nvec ); - } else { - if(!preNode.isNull()) { - bool bflag = (preNode.getKind() == kind::CONST_STRING && preNode.getConst().isEmptyString() ); - if(node_vec.empty() || !bflag ) { - node_vec.push_back( NodeManager::currentNM()->mkNode( kind::STRING_TO_REGEXP, preNode ) ); + // update preReStr + if (!curr.isNull() && curr.getKind() == STRING_TO_REGEXP) + { + preReStr.push_back(curr[0]); + curr = Node::null(); + } + else if (!preReStr.empty()) + { + // this groups consecutive strings a++b ---> ab + Node acc = + nm->mkNode(STRING_TO_REGEXP, mkConcat(STRING_CONCAT, preReStr)); + cvec.push_back(acc); + preReStr.clear(); + } + if (!curr.isNull() && curr.getKind() == REGEXP_STAR) + { + // we can group stars (a*)++(a*) ---> a* + if (!cvec.empty() && cvec.back() == curr) + { + curr = Node::null(); } } - if(node_vec.size() > 1) { - retNode = NodeManager::currentNM()->mkNode(kind::REGEXP_CONCAT, node_vec); - } else { - retNode = node_vec[0]; + if (!curr.isNull()) + { + cvec.push_back(curr); } } - Trace("strings-prerewrite") << "Strings::prerewriteConcatRegExp end " << retNode << std::endl; - return retNode; + Assert(!cvec.empty()); + retNode = mkConcat(REGEXP_CONCAT, cvec); + if (retNode != node) + { + // handles all cases where consecutive re constants are combined, and cases + // where arguments are swapped, as described in the loop above. + return returnRewrite(node, retNode, "re.concat"); + } + return node; } Node TheoryStringsRewriter::prerewriteOrRegExp(TNode node) { @@ -728,6 +766,7 @@ bool TheoryStringsRewriter::testConstStringInRegExp( CVC4::String &s, unsigned i } Node TheoryStringsRewriter::rewriteMembership(TNode node) { + NodeManager* nm = NodeManager::currentNM(); Node retNode = node; Node x = node[0]; Node r = node[1]; @@ -739,10 +778,11 @@ Node TheoryStringsRewriter::rewriteMembership(TNode node) { CVC4::String s = x.getConst(); retNode = NodeManager::currentNM()->mkConst( testConstStringInRegExp( s, 0, r ) ); } else if(r.getKind() == kind::REGEXP_SIGMA) { - Node one = NodeManager::currentNM()->mkConst( ::CVC4::Rational(1) ); - retNode = one.eqNode(NodeManager::currentNM()->mkNode(kind::STRING_LENGTH, x)); + Node one = nm->mkConst(Rational(1)); + retNode = one.eqNode(nm->mkNode(STRING_LENGTH, x)); } else if( r.getKind() == kind::REGEXP_STAR ) { - if( r[0].getKind() == kind::REGEXP_SIGMA ){ + if (r[0].getKind() == kind::REGEXP_SIGMA) + { retNode = NodeManager::currentNM()->mkConst( true ); } }else if( r.getKind() == kind::REGEXP_CONCAT ){ @@ -774,6 +814,14 @@ Node TheoryStringsRewriter::rewriteMembership(TNode node) { retNode = NodeManager::currentNM()->mkNode( r.getKind()==kind::REGEXP_INTER ? kind::AND : kind::OR, mvec ); }else if(r.getKind() == kind::STRING_TO_REGEXP) { retNode = x.eqNode(r[0]); + } + else if (r.getKind() == REGEXP_RANGE) + { + // x in re.range( char_i, char_j ) ---> i <= str.code(x) <= j + Node xcode = nm->mkNode(STRING_CODE, x); + retNode = nm->mkNode(AND, + nm->mkNode(LEQ, nm->mkNode(STRING_CODE, r[0]), xcode), + nm->mkNode(LEQ, xcode, nm->mkNode(STRING_CODE, r[1]))); }else if(x != node[0] || r != node[1]) { retNode = NodeManager::currentNM()->mkNode( kind::STRING_IN_REGEXP, x, r ); } @@ -851,16 +899,11 @@ RewriteResponse TheoryStringsRewriter::postRewrite(TNode node) { Node tmpNode = node[0]; if(tmpNode.isConst()) { retNode = NodeManager::currentNM()->mkConst( ::CVC4::Rational( tmpNode.getConst().size() ) ); - //} else if(tmpNode.getKind() == kind::STRING_SUBSTR) { - //retNode = tmpNode[2]; }else if( tmpNode.getKind()==kind::STRING_CONCAT ){ - // it has to be string concat std::vector node_vec; for(unsigned int i=0; imkConst( ::CVC4::Rational( tmpNode[i].getConst().size() ) ) ); - //} else if(tmpNode[i].getKind() == kind::STRING_SUBSTR) { - // node_vec.push_back( tmpNode[i][2] ); } else { node_vec.push_back( NodeManager::currentNM()->mkNode(kind::STRING_LENGTH, tmpNode[i]) ); } @@ -962,6 +1005,7 @@ RewriteResponse TheoryStringsRewriter::preRewrite(TNode node) { Node retNode = node; Node orig = retNode; Trace("strings-prerewrite") << "Strings::preRewrite start " << node << std::endl; + NodeManager* nm = NodeManager::currentNM(); if (node.getKind() == kind::REGEXP_CONCAT) { @@ -1009,18 +1053,6 @@ RewriteResponse TheoryStringsRewriter::preRewrite(TNode node) { if(node[0] == node[1]) { retNode = NodeManager::currentNM()->mkNode( kind::STRING_TO_REGEXP, node[0] ); } - /*std::vector< Node > vec_nodes; - unsigned char c = node[0].getConst().getFirstChar(); - unsigned char end = node[1].getConst().getFirstChar(); - for(; c<=end; ++c) { - Node n = NodeManager::currentNM()->mkNode( kind::STRING_TO_REGEXP, NodeManager::currentNM()->mkConst( ::CVC4::String( c ) ) ); - vec_nodes.push_back( n ); - } - if(vec_nodes.size() == 1) { - retNode = vec_nodes[0]; - } else { - retNode = NodeManager::currentNM()->mkNode( kind::REGEXP_UNION, vec_nodes ); - }*/ } else if(node.getKind() == kind::REGEXP_LOOP) { Node r = node[0]; if(r.getKind() == kind::REGEXP_STAR) { @@ -1047,8 +1079,11 @@ RewriteResponse TheoryStringsRewriter::preRewrite(TNode node) { //if(!n2.isConst()) { // throw LogicException("re.loop contains non-constant integer (2)."); //} - Node n = vec_nodes.size()==0 ? NodeManager::currentNM()->mkNode(kind::STRING_TO_REGEXP, NodeManager::currentNM()->mkConst(CVC4::String(""))) - : vec_nodes.size()==1 ? r : prerewriteConcatRegExp(NodeManager::currentNM()->mkNode(kind::REGEXP_CONCAT, vec_nodes)); + Node n = vec_nodes.size() == 0 + ? nm->mkNode(STRING_TO_REGEXP, nm->mkConst(String(""))) + : vec_nodes.size() == 1 + ? r + : nm->mkNode(REGEXP_CONCAT, vec_nodes); //Assert(n2.getConst() <= RMAXINT, "Exceeded LONG_MAX in string REGEXP_LOOP (2)"); unsigned u = n2.getConst().getNumerator().toUnsignedInt(); if(u <= l) { @@ -1058,17 +1093,20 @@ RewriteResponse TheoryStringsRewriter::preRewrite(TNode node) { vec2.push_back(n); for(unsigned j=l; jmkNode(kind::REGEXP_CONCAT, vec_nodes)); + n = mkConcat(REGEXP_CONCAT, vec_nodes); vec2.push_back(n); } - retNode = prerewriteOrRegExp(NodeManager::currentNM()->mkNode(kind::REGEXP_UNION, vec2)); + retNode = prerewriteOrRegExp(nm->mkNode(REGEXP_UNION, vec2)); } } else { - Node rest = NodeManager::currentNM()->mkNode(kind::REGEXP_STAR, r); - retNode = vec_nodes.size()==0? rest : prerewriteConcatRegExp( vec_nodes.size()==1? - NodeManager::currentNM()->mkNode(kind::REGEXP_CONCAT, r, rest) - :NodeManager::currentNM()->mkNode(kind::REGEXP_CONCAT, - NodeManager::currentNM()->mkNode(kind::REGEXP_CONCAT, vec_nodes), rest) ); + Node rest = nm->mkNode(REGEXP_STAR, r); + retNode = vec_nodes.size() == 0 + ? rest + : vec_nodes.size() == 1 + ? nm->mkNode(REGEXP_CONCAT, r, rest) + : nm->mkNode(REGEXP_CONCAT, + nm->mkNode(REGEXP_CONCAT, vec_nodes), + rest); } } Trace("strings-lp") << "Strings::lp " << node << " => " << retNode << std::endl; -- 2.30.2