From 9a3adaec00e4d36619a2eb78756914b22cde2a36 Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Thu, 18 Oct 2018 07:37:43 -0500 Subject: [PATCH] Constant length regular expression elimination (#2646) --- src/theory/strings/regexp_elim.cpp | 82 ++++++++++++++++++- .../strings/theory_strings_rewriter.cpp | 53 ++++++++++++ src/theory/strings/theory_strings_rewriter.h | 6 ++ test/regress/CMakeLists.txt | 1 + test/regress/regress2/strings/issue918.smt2 | 30 +++++++ 5 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 test/regress/regress2/strings/issue918.smt2 diff --git a/src/theory/strings/regexp_elim.cpp b/src/theory/strings/regexp_elim.cpp index 0310e4620..a0d806c52 100644 --- a/src/theory/strings/regexp_elim.cpp +++ b/src/theory/strings/regexp_elim.cpp @@ -50,11 +50,89 @@ Node RegExpElimination::eliminateConcat(Node atom) Node x = atom[0]; Node lenx = nm->mkNode(STRING_LENGTH, x); Node re = atom[1]; + std::vector children; + TheoryStringsRewriter::getConcat(re, children); + + // If it can be reduced to memberships in fixed length regular expressions. + // This includes concatenations where at most one child is of the form + // (re.* re.allchar), which we abbreviate _* below, and all other children + // have a fixed length. + // The intuition why this is a "non-aggressive" rewrite is that membership + // into fixed length regular expressions are easy to handle. + bool hasFixedLength = true; + // the index of _* in re + unsigned pivotIndex = 0; + bool hasPivotIndex = false; + std::vector childLengths; + std::vector childLengthsPostPivot; + for (unsigned i = 0, size = children.size(); i < size; i++) + { + Node c = children[i]; + Node fl = TheoryStringsRewriter::getFixedLengthForRegexp(c); + if (fl.isNull()) + { + if (!hasPivotIndex && c.getKind() == REGEXP_STAR + && c[0].getKind() == REGEXP_SIGMA) + { + hasPivotIndex = true; + pivotIndex = i; + // set to zero for the sum below + fl = d_zero; + } + else + { + hasFixedLength = false; + break; + } + } + childLengths.push_back(fl); + if (hasPivotIndex) + { + childLengthsPostPivot.push_back(fl); + } + } + if (hasFixedLength) + { + Assert(re.getNumChildren() == children.size()); + Node sum = nm->mkNode(PLUS, childLengths); + std::vector conc; + conc.push_back(nm->mkNode(hasPivotIndex ? GEQ : EQUAL, lenx, sum)); + Node currEnd = d_zero; + for (unsigned i = 0, size = childLengths.size(); i < size; i++) + { + if (hasPivotIndex && i == pivotIndex) + { + Node ppSum = childLengthsPostPivot.size() == 1 + ? childLengthsPostPivot[0] + : nm->mkNode(PLUS, childLengthsPostPivot); + currEnd = nm->mkNode(MINUS, lenx, ppSum); + } + else + { + Node curr = nm->mkNode(STRING_SUBSTR, x, currEnd, childLengths[i]); + Node currMem = nm->mkNode(STRING_IN_REGEXP, curr, re[i]); + conc.push_back(currMem); + currEnd = nm->mkNode(PLUS, currEnd, childLengths[i]); + currEnd = Rewriter::rewrite(currEnd); + } + } + Node res = nm->mkNode(AND, conc); + // For example: + // x in re.++(re.union(re.range("A", "J"), re.range("N", "Z")), "AB") --> + // len( x ) = 3 ^ + // substr(x,0,1) in re.union(re.range("A", "J"), re.range("N", "Z")) ^ + // substr(x,1,2) in "AB" + // An example with a pivot index: + // x in re.++( "AB" ++ _* ++ "C" ) --> + // len( x ) >= 3 ^ + // substr( x, 0, 2 ) in "AB" ^ + // substr( x, len( x ) - 1, 1 ) in "C" + return returnElim(atom, res, "concat-fixed-len"); + } + // memberships of the form x in re.++ * s1 * ... * sn *, where * are // any number of repetitions (exact or indefinite) of re.allchar. Trace("re-elim-debug") << "Try re concat with gaps " << atom << std::endl; - std::vector children; - TheoryStringsRewriter::getConcat(re, children); bool onlySigmasAndConsts = true; std::vector sep_children; std::vector gap_minsize; diff --git a/src/theory/strings/theory_strings_rewriter.cpp b/src/theory/strings/theory_strings_rewriter.cpp index e8a11e62e..c38a5e838 100644 --- a/src/theory/strings/theory_strings_rewriter.cpp +++ b/src/theory/strings/theory_strings_rewriter.cpp @@ -4538,6 +4538,59 @@ Node TheoryStringsRewriter::getConstantArithBound(Node a, bool isLower) return ret; } +Node TheoryStringsRewriter::getFixedLengthForRegexp(Node n) +{ + NodeManager* nm = NodeManager::currentNM(); + if (n.getKind() == STRING_TO_REGEXP) + { + Node ret = nm->mkNode(STRING_LENGTH, n[0]); + ret = Rewriter::rewrite(ret); + if (ret.isConst()) + { + return ret; + } + } + else if (n.getKind() == REGEXP_SIGMA || n.getKind() == REGEXP_RANGE) + { + return nm->mkConst(Rational(1)); + } + else if (n.getKind() == REGEXP_UNION || n.getKind() == REGEXP_INTER) + { + Node ret; + for (const Node& nc : n) + { + Node flc = getFixedLengthForRegexp(nc); + if (flc.isNull() || (!ret.isNull() && ret != flc)) + { + return Node::null(); + } + else if (ret.isNull()) + { + // first time + ret = flc; + } + } + return ret; + } + else if (n.getKind() == REGEXP_CONCAT) + { + NodeBuilder<> nb(PLUS); + for (const Node& nc : n) + { + Node flc = getFixedLengthForRegexp(nc); + if (flc.isNull()) + { + return flc; + } + nb << flc; + } + Node ret = nb.constructNode(); + ret = Rewriter::rewrite(ret); + return ret; + } + return Node::null(); +} + bool TheoryStringsRewriter::checkEntailArithInternal(Node a) { Assert(Rewriter::rewrite(a) == a); diff --git a/src/theory/strings/theory_strings_rewriter.h b/src/theory/strings/theory_strings_rewriter.h index 2c38ce8dc..3d71423ab 100644 --- a/src/theory/strings/theory_strings_rewriter.h +++ b/src/theory/strings/theory_strings_rewriter.h @@ -566,6 +566,12 @@ class TheoryStringsRewriter { * checkEntailArith( a, strict ) = true. */ static Node getConstantArithBound(Node a, bool isLower = true); + /** get length for regular expression + * + * Given regular expression n, if this method returns a non-null value c, then + * x in n entails len( x ) = c. + */ + static Node getFixedLengthForRegexp(Node n); /** decompose substr chain * * If s is substr( ... substr( base, x1, y1 ) ..., xn, yn ), then this diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt index aef40d373..cd95c6653 100644 --- a/test/regress/CMakeLists.txt +++ b/test/regress/CMakeLists.txt @@ -1713,6 +1713,7 @@ set(regress_2_tests regress2/strings/cmu-disagree-0707-dd.smt2 regress2/strings/cmu-prereg-fmf.smt2 regress2/strings/cmu-repl-len-nterm.smt2 + regress2/strings/issue918.smt2 regress2/strings/non_termination_regular_expression6.smt2 regress2/strings/norn-dis-0707-3.smt2 regress2/strings/repl-repl.smt2 diff --git a/test/regress/regress2/strings/issue918.smt2 b/test/regress/regress2/strings/issue918.smt2 new file mode 100644 index 000000000..8f390df26 --- /dev/null +++ b/test/regress/regress2/strings/issue918.smt2 @@ -0,0 +1,30 @@ +; COMMAND-LINE: --strings-exp --re-elim +; EXPECT: sat +(set-option :produce-models true) +(set-logic UFDTSLIA) +(set-info :smt-lib-version 2.5) +(declare-datatypes () ( + (StringRotation (StringRotation$C_StringRotation (StringRotation$C_StringRotation$sr String))) + (StringRotation2 (StringRotation2$C_StringRotation2 (StringRotation2$C_StringRotation2$sr1 StringRotation) (StringRotation2$C_StringRotation2$sr2 StringRotation))) +) ) + +(define-fun f1005$isValid_string((x$$1008 String)) Bool true) +(define-fun f1035$isValid_StringRotation((x$$1038 StringRotation)) Bool (and (f1005$isValid_string (StringRotation$C_StringRotation$sr x$$1038)) (or (or (or (= (StringRotation$C_StringRotation$sr x$$1038) "0 deg") (= (StringRotation$C_StringRotation$sr x$$1038) "90 deg")) (= (StringRotation$C_StringRotation$sr x$$1038) "180 deg")) (= (StringRotation$C_StringRotation$sr x$$1038) "270 deg")))) +(define-fun f1121$isValid_StringRotation2((x$$1124 StringRotation2)) Bool (and (f1035$isValid_StringRotation (StringRotation2$C_StringRotation2$sr1 x$$1124)) (f1035$isValid_StringRotation (StringRotation2$C_StringRotation2$sr2 x$$1124)))) + +(declare-fun $OutSR$1356$3$1$() StringRotation2) +(assert (f1121$isValid_StringRotation2 $OutSR$1356$3$1$)) +(assert (and (is-StringRotation2$C_StringRotation2 $OutSR$1356$3$1$) (and (and +(is-StringRotation$C_StringRotation (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) (or + +(str.in.re +(StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) +(re.++ (re.union (re.range "u" "~") (re.range " " "t") ) (re.union (re.range "K" "~") (re.range " " "J") ) (re.union (re.range "L" "~") (re.range " " "K") ) (re.union (re.range "y" "~") (re.range " " "x") ) (re.union (re.range "{" "~") (re.range " " "z") ) ) +) +(or +(str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) +(re.++ (re.union (re.range "s" "~") (re.range " " "r") ) (re.union (re.range "{" "~") (re.range " " "z") ) (re.union (re.range "u" "~") (re.range " " "t") ) ) ) + +(or +(str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) (re.union (re.range "<" "~") (re.range " " ";") ) ) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) (re.++ (re.union (re.range "&" "~") (re.range " " "%") ) (re.range " " "~") ) ) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) (re.++ (re.union (re.range "`" "~") (re.range " " "_") ) (re.union (re.range "s" "~") (re.range " " "r") ) (re.union (re.range "1" "~") (re.range " " "0") ) (re.union (re.range "_" "~") (re.range " " "^") ) ) ) (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr1 $OutSR$1356$3$1$)) (re.* (re.union (re.range "\x00" "\x09") (re.range "\x0B" "\x0C") (re.range "\x0E" "\x7F") ) ) ))))))) (and (is-StringRotation$C_StringRotation (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (re.++ (re.union (re.range "<" "~") (re.range " " ";") ) (re.union (re.range "F" "~") (re.range " " "E") ) (re.union (re.range "u" "~") (re.range " " "t") ) (re.union (re.range "P" "~") (re.range " " "O") ) ) ) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (re.union (re.range "E" "~") (re.range " " "D") ) ) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (re.++ (re.union (re.range "x" "~") (re.range " " "w") ) (re.union (re.range "n" "~") (re.range " " "m") ) (re.union (re.range "d" "~") (re.range " " "c") ) (re.union (re.range "]" "~") (re.range " " "\\") ) (re.union (re.range "\\" "~") (re.range " " "[") ) ) ) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (re.++ (re.union (re.range ":" "~") (re.range " " "9") ) (re.union (re.range "+" "~") (re.range " " "*") ) ) ) (or (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (re.++ (re.union (re.range "." "~") (re.range " " "-") ) (re.union (re.range "n" "~") (re.range " " "m") ) (re.union (re.range "|" "~") (re.range " " "{") ) ) ) (str.in.re (StringRotation$C_StringRotation$sr (StringRotation2$C_StringRotation2$sr2 $OutSR$1356$3$1$)) (re.* (re.union (re.range "\x00" "\x09") (re.range "\x0B" "\x0C") (re.range "\x0E" "\x7F") ) ) )))))))))) +(check-sat) -- 2.30.2