From: Andrew Reynolds Date: Sat, 28 Oct 2017 04:16:38 +0000 (-0500) Subject: Improve strings rewriter for contains (#1207) X-Git-Tag: cvc5-1.0.0~5526 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=675e82e32a34911163f9de0e6389eb107be5b0f1;p=cvc5.git Improve strings rewriter for contains (#1207) * Work on rewriter for string contains. * Add rewrites that mix str.to.int and str.contains. Documentation, add regression. * Minor * Minor * Address review, add a few TODOs. Improve some non-digit -> not is number. * Fix * Simplify. * Clang format, minor fixing of comments. --- diff --git a/src/theory/strings/theory_strings_rewriter.cpp b/src/theory/strings/theory_strings_rewriter.cpp index d475305fb..fd8f4a41b 100644 --- a/src/theory/strings/theory_strings_rewriter.cpp +++ b/src/theory/strings/theory_strings_rewriter.cpp @@ -1394,91 +1394,92 @@ RewriteResponse TheoryStringsRewriter::preRewrite(TNode node) { Node TheoryStringsRewriter::rewriteContains( Node node ) { if( node[0] == node[1] ){ return NodeManager::currentNM()->mkConst( true ); - }else if( node[0].isConst() && node[1].isConst() ){ + } + else if (node[0].isConst()) + { CVC4::String s = node[0].getConst(); - CVC4::String t = node[1].getConst(); - if( s.find(t) != std::string::npos ){ - return NodeManager::currentNM()->mkConst( true ); + if (node[1].isConst()) + { + CVC4::String t = node[1].getConst(); + return NodeManager::currentNM()->mkConst(s.find(t) != std::string::npos); }else{ - return NodeManager::currentNM()->mkConst( false ); - } - }else if( node[0].getKind()==kind::STRING_CONCAT ){ - //component-wise containment - unsigned n1 = node[0].getNumChildren(); - std::vector< Node > nc1; - getConcat( node[1], nc1 ); - unsigned n2 = nc1.size(); - if( n1>n2 ){ - unsigned diff = n1-n2; - for(unsigned i=0; imkConst( true ); - } + if (s.size() == 0) + { + return NodeManager::currentNM()->mkNode(kind::EQUAL, node[0], node[1]); + } + else if (node[1].getKind() == kind::STRING_CONCAT) + { + int firstc, lastc; + if (!canConstantContainConcat(node[0], node[1], firstc, lastc)) + { + return NodeManager::currentNM()->mkConst(false); } } } + } + std::vector nc1; + getConcat(node[0], nc1); + std::vector nc2; + getConcat(node[1], nc2); + std::vector nr; + + // component-wise containment + if (componentContains(nc1, nc2, nr) != -1) + { + return NodeManager::currentNM()->mkConst(true); + } + + // strip endpoints + std::vector nb; + std::vector ne; + if (stripConstantEndpoints(nc1, nc2, nb, ne)) + { + return NodeManager::currentNM()->mkNode( + kind::STRING_STRCTN, mkConcat(kind::STRING_CONCAT, nc1), node[1]); + } + Trace("strings-rewrite-debug2") << "No constant endpoints for " << node[0] + << " " << node[1] << std::endl; + + // splitting + if (node[0].getKind() == kind::STRING_CONCAT) + { if( node[1].isConst() ){ CVC4::String t = node[1].getConst(); - for(unsigned i=0; i(); - if( s.find(t)!=std::string::npos ) { - return NodeManager::currentNM()->mkConst( true ); - }else{ - //if no overlap, we can split into disjunction - // if first child, only require no left overlap - // if last child, only require no right overlap - if( i==0 || i==node[0].getNumChildren()-1 || t.find(s)==std::string::npos ){ - bool do_split = false; - int sot = s.overlap( t ); - Trace("strings-ext-rewrite") << "Overlap " << s << " " << t << " is " << sot << std::endl; - if( sot==0 ){ - do_split = i==0; - } - if( !do_split && ( i==(node[0].getNumChildren()-1) || sot==0 ) ){ - int tos = t.overlap( s ); - Trace("strings-ext-rewrite") << "Overlap " << t << " " << s << " is " << tos << std::endl; - if( tos==0 ){ - do_split = true; - } - } - if( do_split ){ - std::vector< Node > nc0; - getConcat( node[0], nc0 ); - std::vector< Node > spl[2]; - if( i>0 ){ - spl[0].insert( spl[0].end(), nc0.begin(), nc0.begin()+i ); - } - if( imkNode( kind::OR, - NodeManager::currentNM()->mkNode( kind::STRING_STRCTN, mkConcat( kind::STRING_CONCAT, spl[0] ), node[1] ), - NodeManager::currentNM()->mkNode( kind::STRING_STRCTN, mkConcat( kind::STRING_CONCAT, spl[1] ), node[1] ) ); - } - } + // if no overlap, we can split into disjunction + if (t.find(s) == std::string::npos && s.overlap(t) == 0 + && t.overlap(s) == 0) + { + std::vector nc0; + getConcat(node[0], nc0); + std::vector spl[2]; + spl[0].insert(spl[0].end(), nc0.begin(), nc0.begin() + i); + Assert(i < nc0.size() - 1); + spl[1].insert(spl[1].end(), nc0.begin() + i + 1, nc0.end()); + return NodeManager::currentNM()->mkNode( + kind::OR, + NodeManager::currentNM()->mkNode( + kind::STRING_STRCTN, + mkConcat(kind::STRING_CONCAT, spl[0]), + node[1]), + NodeManager::currentNM()->mkNode( + kind::STRING_STRCTN, + mkConcat(kind::STRING_CONCAT, spl[1]), + node[1])); } } } } - }else if( node[0].isConst() ){ - if( node[0].getConst().size()==0 ){ - return NodeManager::currentNM()->mkNode( kind::EQUAL, node[0], node[1] ); - }else if( node[1].getKind()==kind::STRING_CONCAT ){ - int firstc, lastc; - if( !canConstantContainConcat( node[0], node[1], firstc, lastc ) ){ - return NodeManager::currentNM()->mkConst( false ); - } - } } return node; } @@ -1754,3 +1755,287 @@ Node TheoryStringsRewriter::collectConstantStringAt( std::vector< Node >& vec, u } } +int TheoryStringsRewriter::componentContains(std::vector& n1, + std::vector& n2, + std::vector& nr, + bool computeRemainder) +{ + if (n2.size() == 1 && n2[0].isConst()) + { + CVC4::String t = n2[0].getConst(); + for (unsigned i = 0; i < n1.size(); i++) + { + if (n1[i].isConst()) + { + CVC4::String s = n1[i].getConst(); + size_t f = s.find(t); + if (f != std::string::npos) + { + if (computeRemainder) + { + Assert(s.size() >= f + t.size()); + if (s.size() > f + t.size()) + { + nr.push_back(NodeManager::currentNM()->mkConst(::CVC4::String( + s.substr(f + t.size(), s.size() - (f + t.size()))))); + } + nr.insert(nr.end(), n1.begin() + i + 1, n1.end()); + n1[i] = NodeManager::currentNM()->mkConst( + ::CVC4::String(s.substr(0, f + t.size()))); + n1.erase(n1.begin() + i + 1, n1.end()); + } + return i; + } + } + } + } + else if (n1.size() >= n2.size()) + { + unsigned diff = n1.size() - n2.size(); + for (unsigned i = 0; i <= diff; i++) + { + bool check = false; + if (n1[i] == n2[0]) + { + check = true; + } + else if (n1[i].isConst() && n2[0].isConst()) + { + // could be suffix + CVC4::String s = n1[i].getConst(); + CVC4::String t = n2[0].getConst(); + if (t.size() < s.size() && s.suffix(t.size()) == t) + { + check = true; + } + } + if (check) + { + bool success = true; + for (unsigned j = 1; j < n2.size(); j++) + { + if (n1[i + j] != n2[j]) + { + if (j + 1 == n2.size() && n1[i + j].isConst() && n2[j].isConst()) + { + // could be prefix + CVC4::String s = n1[i + j].getConst(); + CVC4::String t = n2[j].getConst(); + if (t.size() < s.size() && s.prefix(t.size()) == t) + { + if (computeRemainder) + { + if (s.size() > t.size()) + { + nr.push_back( + NodeManager::currentNM()->mkConst(::CVC4::String( + s.substr(t.size(), s.size() - t.size())))); + } + n1[i + j] = NodeManager::currentNM()->mkConst( + ::CVC4::String(s.substr(0, t.size()))); + } + } + else + { + success = false; + break; + } + } + else + { + success = false; + break; + } + } + } + if (success) + { + if (computeRemainder) + { + nr.insert(nr.end(), n1.begin() + i + n2.size(), n1.end()); + n1.erase(n1.begin() + i + n2.size(), n1.end()); + } + return i; + } + } + } + } + return -1; +} + +bool TheoryStringsRewriter::stripConstantEndpoints(std::vector& n1, + std::vector& n2, + std::vector& nb, + std::vector& ne, + int dir) +{ + bool changed = false; + // for ( forwards, backwards ) direction + for (unsigned r = 0; r < 2; r++) + { + if (dir == 0 || (r == 0 && dir == -1) || (r == 1 && dir == 1)) + { + unsigned index0 = r == 0 ? 0 : n1.size() - 1; + unsigned index1 = r == 0 ? 0 : n2.size() - 1; + bool removeComponent = false; + Trace("strings-rewrite-debug2") << "stripConstantEndpoints : Compare " + << n1[index0] << " " << n2[index1] + << ", dir = " << dir << std::endl; + if (n1[index0].isConst()) + { + CVC4::String s = n1[index0].getConst(); + // overlap is an overapproximation of the number of characters + // n2[index1] can match in s + unsigned overlap = s.size(); + if (n2[index1].isConst()) + { + CVC4::String t = n2[index1].getConst(); + std::size_t ret = s.find(t); + if (ret == std::string::npos) + { + if (n1.size() == 1) + { + // can remove everything + // e.g. str.contains( "abc", str.++( "ba", x ) ) --> + // str.contains( "", str.++( "ba", x ) ) + removeComponent = true; + } + else + { + // check how much overlap there is + // This is used to partially strip off the endpoint + // e.g. str.contains( str.++( "abc", x ), str.++( "cd", y ) ) --> + // str.contains( str.++( "c", x ), str.++( "cd", y ) ) + overlap = r == 0 ? s.overlap(t) : t.overlap(s); + } + } + else + { + Assert(ret < s.size()); + // can strip off up to the find position + // e.g. str.contains( str.++( "abc", x ), str.++( "b", y ) ) --> + // str.contains( str.++( "bc", x ), str.++( "b", y ) ) + overlap = s.size() - ret; + } + } + else if (n2[index1].getKind() == kind::STRING_ITOS) + { + const std::vector& svec = s.getVec(); + // can remove up to the first occurrence of a non-digit + for (unsigned i = 0; i < svec.size(); i++) + { + unsigned sindex = r == 0 ? i : svec.size() - i; + if (String::isDigit(svec[sindex])) + { + break; + } + else + { + // e.g. str.contains( str.++( "a", x ), str.to.int(y) ) --> + // str.contains( x, str.to.int(y) ) + overlap--; + } + } + } + else + { + // inconclusive + } + // process the overlap + if (overlap < s.size()) + { + changed = true; + if (overlap == 0) + { + removeComponent = true; + } + else + { + // can drop the prefix (resp. suffix) from the first (resp. last) + // component + if (r == 0) + { + nb.push_back( + NodeManager::currentNM()->mkConst(s.prefix(overlap))); + n1[index0] = NodeManager::currentNM()->mkConst(s.suffix(overlap)); + } + else + { + ne.push_back( + NodeManager::currentNM()->mkConst(s.suffix(overlap))); + n1[index0] = NodeManager::currentNM()->mkConst(s.prefix(overlap)); + } + } + } + } + else if (n1[index0].getKind() == kind::STRING_ITOS) + { + if (n2[index1].isConst()) + { + CVC4::String t = n2[index1].getConst(); + + if (n1.size() == 1) + { + // if n1.size()==1, then if n2[index1] is not a number, we can drop + // the entire component + // e.g. str.contains( str.to.int(x), "123a45") --> false + if (!t.isNumber()) + { + removeComponent = true; + } + } + else + { + const std::vector& tvec = t.getVec(); + Assert(tvec.size() > 0); + + // if n1.size()>1, then if the first (resp. last) character of + // n2[index1] + // is not a digit, we can drop the entire component, e.g.: + // str.contains( str.++( str.to.int(x), y ), "a12") --> + // str.contains( y, "a12" ) + // str.contains( str.++( y, str.to.int(x) ), "a0b") --> + // str.contains( y, "a0b" ) + unsigned i = r == 0 ? 0 : (tvec.size() - 1); + if (!String::isDigit(tvec[i])) + { + removeComponent = true; + } + } + } + } + if (removeComponent) + { + // can drop entire first (resp. last) component + if (r == 0) + { + nb.push_back(n1[index0]); + n1.erase(n1.begin(), n1.begin() + 1); + } + else + { + ne.push_back(n1[index0]); + n1.pop_back(); + } + if (n1.empty()) + { + // if we've removed everything, just return (we will rewrite to false) + return true; + } + else + { + changed = true; + } + } + } + } + // TODO (#1180) : computing the maximal overlap in this function may be + // important. + // str.contains( str.++( str.to.int(x), str.substr(y,0,3) ), "2aaaa" ) ---> + // false + // ...since str.to.int(x) can contain at most 1 character from "2aaaa", + // leaving 4 characters + // which is larger that the upper bound for length of str.substr(y,0,3), + // which is 3. + return changed; +} diff --git a/src/theory/strings/theory_strings_rewriter.h b/src/theory/strings/theory_strings_rewriter.h index d5a35926e..2dff1ad52 100644 --- a/src/theory/strings/theory_strings_rewriter.h +++ b/src/theory/strings/theory_strings_rewriter.h @@ -9,10 +9,8 @@ ** All rights reserved. See the file COPYING in the top-level source ** directory for licensing information.\endverbatim ** - ** \brief [[ Add one-line brief description here ]] + ** \brief Rewriter for the theory of strings ** - ** [[ Add lengthier description here ]] - ** \todo document this file **/ #include "cvc4_private.h" @@ -53,20 +51,118 @@ public: static inline void init() {} static inline void shutdown() {} - + /** rewrite contains + * This is the entry point for post-rewriting terms n of the form + * str.contains( t, s ) + * Returns the rewritten form of n. + * + * For details on some of the basic rewrites done in this function, see Figure + * 7 of Reynolds et al "Scaling Up DPLL(T) String Solvers Using + * Context-Dependent Rewriting", CAV 2017. + */ static Node rewriteContains( Node n ); static Node rewriteIndexof( Node n ); static Node rewriteReplace( Node n ); - + + /** gets the "vector form" of term n, adds it to c. + * For example: + * when n = str.++( x, y ), c is { x, y } + * when n = str.++( x, str.++( y, z ), w ), c is { x, str.++( y, z ), w ) + * when n = x, c is { x } + * + * Also applies to regular expressions (re.++ above). + */ static void getConcat( Node n, std::vector< Node >& c ); static Node mkConcat( Kind k, std::vector< Node >& c ); static Node splitConstant( Node a, Node b, int& index, bool isRev ); - /** return true if constant c can contain the concat n/list l in order - firstc/lastc store which indices were used */ - static bool canConstantContainConcat( Node c, Node n, int& firstc, int& lastc ); + /** can constant contain list + * return true if constant c can contain the list l in order + * firstc/lastc store which indices in l were used to determine the return + * value. + * (This is typically used when this function returns false, for minimizing + * explanations) + * + * For example: + * canConstantContainList( "abc", { x, "c", y } ) returns true + * firstc/lastc are updated to 1/1 + * canConstantContainList( "abc", { x, "d", y } ) returns false + * firstc/lastc are updated to 1/1 + * canConstantContainList( "abcdef", { x, "b", y, "a", z, "c", w } + * returns false + * firstc/lastc are updated to 1/3 + * canConstantContainList( "abcdef", { x, "b", y, "e", z, "c", w } + * returns false + * firstc/lastc are updated to 1/5 + */ static bool canConstantContainList( Node c, std::vector< Node >& l, int& firstc, int& lastc ); + /** can constant contain concat + * same as above but with n = str.++( l ) instead of l + */ + static bool canConstantContainConcat(Node c, Node n, int& firstc, int& lastc); static Node getNextConstantAt( std::vector< Node >& vec, unsigned& start_index, unsigned& end_index, bool isRev ); static Node collectConstantStringAt( std::vector< Node >& vec, unsigned& end_index, bool isRev ); + + /** component contains + * This function is used when rewriting str.contains( t1, t2 ), where + * n1 is the vector form of t1 + * n2 is the vector form of t2 + * + * if this function returns n>=0 for some n, then + * n1 = { x1...x{n-1} xn...x{n+s} x{n+s+1}...xm }, + * n2 = { y1...ys }, + * y1 is a suffix of xn, + * y2...y{s-1} = x{n+1}...x{n+s-1}, and + * ys is a prefix of x{n+s} + * if computeRemainder = true, then + * n1 is updated to { x1...x{n-1} xn... x{n+s-1} ys }, and + * nr is set to { ( x{n+s} \ ys ) x{n+s+1}...xm } + * where ( x{n+s} \ ys ) denotes string remainder (see operator "\" in + * Section 3.2 of Reynolds et al CAV 2017). + * otherwise it returns -1. + * + * For example: + * componentContains( { y, "abc", x, "def" }, { "c", x, "de" }, {}, true ) + * returns 1, + * n1 is updated to { y, "abc", x, "de" }, + * nr is updated to { "f" } + */ + static int componentContains(std::vector& n1, + std::vector& n2, + std::vector& nr, + bool computeRemainder = false); + /** strip constant endpoints + * This function is used when rewriting str.contains( t1, t2 ), where + * n1 is the vector form of t1 + * n2 is the vector form of t2 + * + * It modifies n1 to a new vector n1' such that: + * (1) str.contains( str.++( n1 ), str.++( n2 ) ) is equivalent to + * str.contains( str.++( n1' ), str.++( n2 ) ) + * (2) str.++( n1 ) = str.++( nb, n1', ne ) + * + * "dir" is the direction in which we can modify n1: + * if dir = 1, then we allow dropping components from the front of n1, + * if dir = -1, then we allow dropping components from the back of n1, + * if dir = 0, then we allow dropping components from either. + * + * It returns true if n1 is modified. + * + * For example: + * stripConstantEndpoints( { "ab", x, "de" }, { "c" }, {}, {}, 1 ) + * returns true, + * n1 is updated to { x, "de" } + * nb is updated to { "ab" } + * stripConstantEndpoints( { "ab", x, "de" }, { "bd" }, {}, {}, 0 ) + * returns true, + * n1 is updated to { "b", x, "d" } + * nb is updated to { "a" } + * ne is updated to { "e" } + */ + static bool stripConstantEndpoints(std::vector& n1, + std::vector& n2, + std::vector& nb, + std::vector& ne, + int dir = 0); };/* class TheoryStringsRewriter */ }/* CVC4::theory::strings namespace */ diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp index 681b574a3..d93c5426e 100644 --- a/src/util/regexp.cpp +++ b/src/util/regexp.cpp @@ -357,14 +357,20 @@ bool String::isNumber() const { return false; } for (unsigned character : d_str) { - unsigned char c = convertUnsignedIntToChar(character); - if (c < '0' || c > '9') { + if (!isDigit(character)) + { return false; } } return true; } +bool String::isDigit(unsigned character) +{ + unsigned char c = convertUnsignedIntToChar(character); + return c >= '0' && c <= '9'; +} + int String::toNumber() const { if (isNumber()) { int ret = 0; diff --git a/src/util/regexp.h b/src/util/regexp.h index 9d351dde4..d51ef4372 100644 --- a/src/util/regexp.h +++ b/src/util/regexp.h @@ -127,15 +127,41 @@ class CVC4_PUBLIC String { String prefix(std::size_t i) const { return substr(0, i); } String suffix(std::size_t i) const { return substr(size() - i, i); } - // if y=y1...yn and overlap returns m, then this is x1...y1...ym + /** string overlap + * + * if overlap returns m>0, + * then the maximal suffix of this string that is a prefix of y is of length m. + * + * For example, if x is "abcdef", then: + * x.overlap("defg") = 3 + * x.overlap("ab") = 0 + * x.overlap("d") = 0 + * x.overlap("bcdefdef") = 5 + */ std::size_t overlap(const String& y) const; - // if y=y1...yn and overlap returns m, then this is y(n+1-m)...yn...xk + /** string reverse overlap + * + * if roverlap returns m>0, + * then the maximal prefix of this string that is a suffix of y is of length m. + * + * For example, if x is "abcdef", then: + * x.roverlap("aaabc") = 3 + * x.roverlap("def") = 0 + * x.roverlap("d") = 0 + * x.roverlap("defabcde") = 5 + * + * Notice that x.overlap(y) = y.roverlap(x) + */ std::size_t roverlap(const String& y) const; bool isNumber() const; int toNumber() const; const std::vector& getVec() const { return d_str; } + /** is the unsigned a digit? + * The input should be the same type as the element type of d_str + */ + static bool isDigit(unsigned character); private: // guarded diff --git a/test/regress/regress0/strings/Makefile.am b/test/regress/regress0/strings/Makefile.am index 6cdba7c9d..e27a1bfcd 100644 --- a/test/regress/regress0/strings/Makefile.am +++ b/test/regress/regress0/strings/Makefile.am @@ -89,7 +89,8 @@ TESTS = \ repl-empty-sem.smt2 \ bug799-min.smt2 \ strings-charat.cvc \ - issue1105.smt2 + issue1105.smt2 \ + issue1189.smt2 FAILING_TESTS = diff --git a/test/regress/regress0/strings/issue1189.smt2 b/test/regress/regress0/strings/issue1189.smt2 new file mode 100644 index 000000000..fae641ea8 --- /dev/null +++ b/test/regress/regress0/strings/issue1189.smt2 @@ -0,0 +1,6 @@ +(set-logic ALL_SUPPORTED) +(set-info :status unsat) +(set-option :strings-exp true) +(declare-const x Int) +(assert (str.contains (str.++ "some text" (int.to.str x) "tor") "vector")) +(check-sat)