From: Andres Noetzli Date: Fri, 30 Aug 2019 02:38:17 +0000 (-0700) Subject: Infer conflicts based on regular expression inclusion (#3234) X-Git-Tag: cvc5-1.0.0~3987 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=974fc1d23c2b6091c26cf316964c4c16c5e2733f;p=cvc5.git Infer conflicts based on regular expression inclusion (#3234) We have a conflict if we have `str.in.re(x, R1)` and `~str.in.re(x, R2)` and `R2` includes `R1` because there is no possible value for `x` that satisfies both memberships. This commit adds code to detect regular expression inclusion for a small fragment of regular expressions: string literals with single char (`re.allchar`) and multichar wildcards (`re.*(re.allchar)`). Signed-off-by: Andres Noetzli --- diff --git a/cmake/FindCxxTest.cmake b/cmake/FindCxxTest.cmake index cd7aed70d..19fd7f881 100644 --- a/cmake/FindCxxTest.cmake +++ b/cmake/FindCxxTest.cmake @@ -26,7 +26,7 @@ endif() if(PYTHONINTERP_FOUND AND CxxTest_PYTHON_TESTGEN_EXECUTABLE) set(CxxTest_TESTGEN_EXECUTABLE ${CxxTest_PYTHON_TESTGEN_EXECUTABLE}) - set(CxxTest_TESTGEN_INTERPRETER ${PYTHON_EXECUTABLE}) + # set(CxxTest_TESTGEN_INTERPRETER ${PYTHON_EXECUTABLE}) elseif(PERL_FOUND AND CxxTest_PERL_TESTGEN_EXECUTABLE) set(CxxTest_TESTGEN_EXECUTABLE ${CxxTest_PERL_TESTGEN_EXECUTABLE}) set(CxxTest_TESTGEN_INTERPRETER ${PERL_EXECUTABLE}) diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp index 7286e2fb4..b410670e5 100644 --- a/src/theory/strings/regexp_operation.cpp +++ b/src/theory/strings/regexp_operation.cpp @@ -19,6 +19,7 @@ #include "expr/kind.h" #include "options/strings_options.h" #include "theory/strings/theory_strings_rewriter.h" +#include "theory/strings/theory_strings_utils.h" using namespace CVC4; using namespace CVC4::kind; @@ -1652,6 +1653,115 @@ std::string RegExpOpr::mkString( Node r ) { return retStr; } +bool RegExpOpr::regExpIncludes(Node r1, Node r2) +{ + Assert(Rewriter::rewrite(r1) == r1); + Assert(Rewriter::rewrite(r2) == r2); + + if (r1 == r2) + { + return true; + } + + // This method only works on a fragment of regular expressions + if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2)) + { + return false; + } + + const auto& it = d_inclusionCache.find(std::make_pair(r1, r2)); + if (it != d_inclusionCache.end()) + { + return (*it).second; + } + + std::vector v1, v2; + utils::getRegexpComponents(r1, v1); + utils::getRegexpComponents(r2, v2); + + // In the following, we iterate over `r2` (the "includee") and try to + // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible + // positions in `r1` that we could currently be at. + std::unordered_set newIdxs = {0}; + std::unordered_set idxs; + for (const Node& n2 : v2) + { + // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are + // removed and for (re.* re.allchar), we additionally include the + // option of skipping it. + idxs.clear(); + for (size_t idx : newIdxs) + { + if (idx < v1.size()) + { + idxs.insert(idx); + if (v1[idx] == d_sigma_star) + { + Assert(idx + 1 == v1.size() || v1[idx + 1] != d_sigma_star); + idxs.insert(idx + 1); + } + } + } + newIdxs.clear(); + + if (n2.getKind() == STRING_TO_REGEXP || n2 == d_sigma) + { + Assert(n2 == d_sigma + || (n2[0].isConst() && n2[0].getConst().size() == 1)); + for (size_t idx : idxs) + { + if (v1[idx] == d_sigma || v1[idx] == n2) + { + // Given a character or an re.allchar in `r2`, we can either + // match it with a corresponding character in `r1` or an + // re.allchar + newIdxs.insert(idx + 1); + } + } + } + + for (size_t idx : idxs) + { + if (v1[idx] == d_sigma_star) + { + // (re.* re.allchar) can match an arbitrary amount of `r2` + newIdxs.insert(idx); + } + else if (utils::isUnboundedWildcard(v1, idx)) + { + // If a series of re.allchar is followed by (re.* re.allchar), we + // can decide not to "waste" the re.allchar because the order of + // the two wildcards is not observable (i.e. it does not change + // the sequences matched by the regular expression) + newIdxs.insert(idx); + } + } + + if (newIdxs.empty()) + { + // If there are no candidates, we can't match the remainder of r2 + d_inclusionCache[std::make_pair(r1, r2)] = false; + return false; + } + } + + // We have processed all of `r2`. We are now looking if there was also a + // path to the end in `r1`. This makes sure that we don't have leftover + // bits in `r1` that don't match anything in `r2`. + bool result = false; + for (size_t idx : newIdxs) + { + if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == d_sigma_star)) + { + result = true; + break; + } + } + + d_inclusionCache[std::make_pair(r1, r2)] = result; + return result; +} + }/* CVC4::theory::strings namespace */ }/* CVC4::theory namespace */ }/* CVC4 namespace */ diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h index cb35b37c6..117449d35 100644 --- a/src/theory/strings/regexp_operation.h +++ b/src/theory/strings/regexp_operation.h @@ -85,6 +85,7 @@ class RegExpOpr { std::map d_rm_inter_cache; std::map d_norv_cache; std::map > d_split_cache; + std::map d_inclusionCache; void simplifyPRegExp(Node s, Node r, std::vector &new_nodes); void simplifyNRegExp(Node s, Node r, std::vector &new_nodes); /** @@ -129,6 +130,21 @@ class RegExpOpr { Node intersect(Node r1, Node r2, bool &spflag); /** Get the pretty printed version of the regular expression r */ static std::string mkString(Node r); + + /** + * Returns true if we can show that the regular expression `r1` includes + * the regular expression `r2` (i.e. `r1` matches a superset of sequences + * that `r2` matches). This method only works on a fragment of regular + * expressions, specifically regular expressions that pass the + * `isSimpleRegExp` check. + * + * @param r1 The regular expression that may include `r2` (must be in + * rewritten form) + * @param r2 The regular expression that may be included by `r1` (must be + * in rewritten form) + * @return True if the inclusion can be shown, false otherwise + */ + bool regExpIncludes(Node r1, Node r2); }; }/* CVC4::theory::strings namespace */ diff --git a/src/theory/strings/regexp_solver.cpp b/src/theory/strings/regexp_solver.cpp index ea38c4dd9..db4c9012c 100644 --- a/src/theory/strings/regexp_solver.cpp +++ b/src/theory/strings/regexp_solver.cpp @@ -65,9 +65,15 @@ void RegExpSolver::check(const std::map >& mems) Trace("regexp-process") << "Checking Memberships ... " << std::endl; for (const std::pair >& mr : mems) { + std::vector mems = mr.second; Trace("regexp-process") << "Memberships(" << mr.first << ") = " << mr.second << std::endl; - if (!checkEqcIntersect(mr.second)) + if (!checkEqcInclusion(mems)) + { + // conflict discovered, return + return; + } + if (!checkEqcIntersect(mems)) { // conflict discovered, return return; @@ -271,6 +277,90 @@ void RegExpSolver::check(const std::map >& mems) } } +bool RegExpSolver::checkEqcInclusion(std::vector& mems) +{ + std::unordered_set remove; + + for (const Node& m1 : mems) + { + bool m1Neg = m1.getKind() == NOT; + Node m1Lit = m1Neg ? m1[0] : m1; + + if (remove.find(m1) != remove.end()) + { + // Skip memberships marked for removal + continue; + } + + for (const Node& m2 : mems) + { + if (m1 == m2) + { + continue; + } + + bool m2Neg = m2.getKind() == NOT; + Node m2Lit = m2Neg ? m2[0] : m2; + + // Both regular expression memberships have the same polarity + if (m1Neg == m2Neg) + { + if (d_regexp_opr.regExpIncludes(m1Lit[1], m2Lit[1])) + { + if (m1Neg) + { + // ~str.in.re(x, R1) includes ~str.in.re(x, R2) ---> + // mark ~str.in.re(x, R2) as reduced + d_parent.getExtTheory()->markReduced(m2Lit); + remove.insert(m2); + } + else + { + // str.in.re(x, R1) includes str.in.re(x, R2) ---> + // mark str.in.re(x, R1) as reduced + d_parent.getExtTheory()->markReduced(m1Lit); + remove.insert(m1); + + // We don't need to process m1 anymore + break; + } + } + } + else + { + Node pos = m1Neg ? m2Lit : m1Lit; + Node neg = m1Neg ? m1Lit : m2Lit; + if (d_regexp_opr.regExpIncludes(neg[1], pos[1])) + { + // We have a conflict because we have a case where str.in.re(x, R1) + // and ~str.in.re(x, R2) but R2 includes R1, so there is no + // possible value for x that satisfies both memberships. + std::vector vec_nodes; + vec_nodes.push_back(pos); + vec_nodes.push_back(neg.negate()); + + if (pos[0] != neg[0]) + { + vec_nodes.push_back(pos[0].eqNode(neg[0])); + } + + Node conc; + d_im.sendInference(vec_nodes, conc, "Intersect inclusion", true); + return false; + } + } + } + } + + mems.erase(std::remove_if( + mems.begin(), + mems.end(), + [&remove](Node& n) { return remove.find(n) != remove.end(); }), + mems.end()); + + return true; +} + bool RegExpSolver::checkEqcIntersect(const std::vector& mems) { // do not compute intersections if the re intersection mode is none diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h index f3abb2a1d..c4d6afda0 100644 --- a/src/theory/strings/regexp_solver.h +++ b/src/theory/strings/regexp_solver.h @@ -63,6 +63,23 @@ class RegExpSolver void check(const std::map>& mems); private: + /** + * Check memberships in equivalence class for regular expression + * inclusion. + * + * This method returns false if it discovered a conflict for this set of + * assertions, and true otherwise. It discovers a conflict e.g. if mems + * contains str.in.re(xi, Ri) and ~str.in.re(xj, Rj) and Rj includes Ri. + * + * @param mems Vector of memberships of the form: (~)str.in.re(x1, R1) + * ... (~)str.in.re(xn, Rn) where x1 = ... = xn in the + * current context. The function removes elements from this + * vector that were marked as reduced. + * @param expForRe Additional explanations for regular expressions. + * @return False if a conflict was detected, true otherwise + */ + bool checkEqcInclusion(std::vector& mems); + /** * Check memberships for equivalence class. * The vector mems is a vector of memberships of the form: diff --git a/src/theory/strings/theory_strings_utils.cpp b/src/theory/strings/theory_strings_utils.cpp index d3ae5384e..d764b87c1 100644 --- a/src/theory/strings/theory_strings_utils.cpp +++ b/src/theory/strings/theory_strings_utils.cpp @@ -134,6 +134,73 @@ Node getConstantEndpoint(Node e, bool isSuf) return getConstantComponent(e); } +bool isUnboundedWildcard(const std::vector& rs, size_t start) +{ + size_t i = start; + while (i < rs.size() && rs[i].getKind() == REGEXP_SIGMA) + { + i++; + } + + if (i >= rs.size()) + { + return false; + } + + return rs[i].getKind() == REGEXP_STAR && rs[i][0].getKind() == REGEXP_SIGMA; +} + +bool isSimpleRegExp(Node r) +{ + Assert(r.getType().isRegExp()); + + std::vector v; + utils::getConcat(r, v); + for (const Node& n : v) + { + if (n.getKind() == STRING_TO_REGEXP) + { + if (!n[0].isConst()) + { + return false; + } + } + else if (n.getKind() != REGEXP_SIGMA + && (n.getKind() != REGEXP_STAR || n[0].getKind() != REGEXP_SIGMA)) + { + return false; + } + } + return true; +} + +void getRegexpComponents(Node r, std::vector& result) +{ + Assert(r.getType().isRegExp()); + + NodeManager* nm = NodeManager::currentNM(); + if (r.getKind() == REGEXP_CONCAT) + { + for (const Node& n : r) + { + getRegexpComponents(n, result); + } + } + else if (r.getKind() == STRING_TO_REGEXP && r[0].isConst()) + { + String s = r[0].getConst(); + for (size_t i = 0, size = s.size(); i < size; i++) + { + result.push_back( + nm->mkNode(STRING_TO_REGEXP, nm->mkConst(s.substr(i, 1)))); + } + } + else + { + result.push_back(r); + } +} + } // namespace utils } // namespace strings } // namespace theory diff --git a/src/theory/strings/theory_strings_utils.h b/src/theory/strings/theory_strings_utils.h index 57d882625..cadc98df3 100644 --- a/src/theory/strings/theory_strings_utils.h +++ b/src/theory/strings/theory_strings_utils.h @@ -78,6 +78,39 @@ Node getConstantComponent(Node t); */ Node getConstantEndpoint(Node e, bool isSuf); +/** + * Given a vector of regular expression nodes and a start index that points to + * a wildcard, returns true if the wildcard is unbounded (i.e. it is followed + * by an arbitrary number of `re.allchar`s and then an `re.*(re.allchar)`. If + * the start index is not a wilcard or the wildcards are not followed by + * `re.*(re.allchar)`, the function returns false. + * + * @param rs The vector of regular expression nodes + * @param start The start index to consider + * @return True if the wilcard pointed to by `start` is unbounded, false + * otherwise + */ +bool isUnboundedWildcard(const std::vector& rs, size_t start); + +/** + * Returns true iff the given regular expression only consists of re.++, + * re.allchar, (re.* re.allchar), and str.to.re of string literals. + * + * @param r The regular expression to check + * @return True if the regular expression is simple + */ +bool isSimpleRegExp(Node r); + +/** + * Helper function that takes a regular expression concatenation and + * returns the components of the concatenation. Letters of string literals + * are treated as individual components. + * + * @param r The regular expression + * @param result The resulting components + */ +void getRegexpComponents(Node r, std::vector& result); + } // namespace utils } // namespace strings } // namespace theory diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt index 80b7c33bf..444a740ea 100644 --- a/test/regress/CMakeLists.txt +++ b/test/regress/CMakeLists.txt @@ -862,6 +862,8 @@ set(regress_0_tests regress0/strings/norn-31.smt2 regress0/strings/norn-simp-rew.smt2 regress0/strings/re.all.smt2 + regress0/strings/regexp_inclusion.smt2 + regress0/strings/regexp_inclusion_reduction.smt2 regress0/strings/repl-rewrites2.smt2 regress0/strings/replace-const.smt2 regress0/strings/replaceall-eval.smt2 diff --git a/test/regress/regress0/strings/regexp_inclusion.smt2 b/test/regress/regress0/strings/regexp_inclusion.smt2 new file mode 100644 index 000000000..c5c68a408 --- /dev/null +++ b/test/regress/regress0/strings/regexp_inclusion.smt2 @@ -0,0 +1,13 @@ +; COMMAND-LINE: --strings-exp --no-re-elim +(set-info :status unsat) +(set-logic ALL) +(declare-const actionName String) + +(assert +(str.in.re actionName (re.++ (str.to.re "wiz") (re.* re.allchar) (str.to.re "foobar") (re.* re.allchar) (str.to.re "baz/") (re.* re.allchar)))) + +(assert (not +(str.in.re actionName (re.++ (str.to.re "wiz") (re.* re.allchar) (re.++ (str.to.re "foo") re.allchar (str.to.re "ar")) (re.* re.allchar) (str.to.re "baz/") (re.* re.allchar))) +)) + +(check-sat) diff --git a/test/regress/regress0/strings/regexp_inclusion_reduction.smt2 b/test/regress/regress0/strings/regexp_inclusion_reduction.smt2 new file mode 100644 index 000000000..c10c287bb --- /dev/null +++ b/test/regress/regress0/strings/regexp_inclusion_reduction.smt2 @@ -0,0 +1,14 @@ +; COMMAND-LINE: --strings-exp --no-re-elim +(set-info :status sat) +(set-logic QF_SLIA) +(declare-const x String) +(declare-const y String) + +(assert (str.in.re x (re.++ (str.to.re "bar") (re.* re.allchar) (str.to.re "bar")))) +(assert (str.in.re x (re.++ (str.to.re "ba") re.allchar (re.* re.allchar) (str.to.re "bar")))) + +(assert (not (str.in.re y (re.++ (str.to.re "bar") (re.* re.allchar) (str.to.re "bar"))))) +(assert (not (str.in.re y (re.++ (str.to.re "ba") re.allchar (re.* re.allchar) (str.to.re "bar"))))) +(assert (not (= y ""))) + +(check-sat) diff --git a/test/unit/theory/CMakeLists.txt b/test/unit/theory/CMakeLists.txt index 963466b09..02ca5d8b5 100644 --- a/test/unit/theory/CMakeLists.txt +++ b/test/unit/theory/CMakeLists.txt @@ -1,3 +1,4 @@ +cvc4_add_unit_test_black(regexp_operation_black theory) cvc4_add_unit_test_black(theory_black theory) cvc4_add_unit_test_white(evaluator_white theory) cvc4_add_unit_test_white(logic_info_white theory) diff --git a/test/unit/theory/regexp_operation_black.h b/test/unit/theory/regexp_operation_black.h new file mode 100644 index 000000000..81f5cf34a --- /dev/null +++ b/test/unit/theory/regexp_operation_black.h @@ -0,0 +1,152 @@ +/********************* */ +/*! \file regexp_operation_black.h + ** \verbatim + ** Top contributors (to current version): + ** Andres Noetzli + ** This file is part of the CVC4 project. + ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS + ** in the top-level source directory) and their institutional affiliations. + ** All rights reserved. See the file COPYING in the top-level source + ** directory for licensing information.\endverbatim + ** + ** \brief Unit tests for symbolic regular expression operations + ** + ** Unit tests for symbolic regular expression operations. + **/ + +#include "expr/node.h" +#include "expr/node_manager.h" +#include "smt/smt_engine.h" +#include "smt/smt_engine_scope.h" +#include "theory/strings/regexp_operation.h" + +#include +#include +#include +#include + +using namespace CVC4; +using namespace CVC4::kind; +using namespace CVC4::smt; +using namespace CVC4::theory; +using namespace CVC4::theory::strings; + +class RegexpOperationBlack : public CxxTest::TestSuite +{ + public: + RegexpOperationBlack() {} + + void setUp() override + { + Options opts; + opts.setOutputLanguage(language::output::LANG_SMTLIB_V2); + d_em = new ExprManager(opts); + d_smt = new SmtEngine(d_em); + d_scope = new SmtScope(d_smt); + d_regExpOpr = new RegExpOpr(); + + d_nm = NodeManager::currentNM(); + } + + void tearDown() override + { + delete d_regExpOpr; + delete d_scope; + delete d_smt; + delete d_em; + } + + void includes(Node r1, Node r2) + { + r1 = Rewriter::rewrite(r1); + r2 = Rewriter::rewrite(r2); + std::cout << r1 << " includes " << r2 << std::endl; + TS_ASSERT(d_regExpOpr->regExpIncludes(r1, r2)); + } + + void doesNotInclude(Node r1, Node r2) + { + r1 = Rewriter::rewrite(r1); + r2 = Rewriter::rewrite(r2); + std::cout << r1 << " does not include " << r2 << std::endl; + TS_ASSERT(!d_regExpOpr->regExpIncludes(r1, r2)); + } + + void testBasic() + { + Node sigma = d_nm->mkNode(REGEXP_SIGMA, std::vector{}); + Node sigmaStar = d_nm->mkNode(REGEXP_STAR, sigma); + Node a = d_nm->mkNode(STRING_TO_REGEXP, d_nm->mkConst(String("a"))); + Node c = d_nm->mkNode(STRING_TO_REGEXP, d_nm->mkConst(String("c"))); + Node abc = d_nm->mkNode(STRING_TO_REGEXP, d_nm->mkConst(String("abc"))); + Node sigma3 = d_nm->mkNode(REGEXP_CONCAT, sigma, sigma, sigma); + Node asc = d_nm->mkNode(REGEXP_CONCAT, a, sigma, c); + Node asw = d_nm->mkNode(REGEXP_CONCAT, a, sigma, sigmaStar); + + includes(sigma3, abc); + doesNotInclude(abc, sigma3); + + includes(sigma3, asc); + doesNotInclude(asc, sigma3); + + includes(asc, abc); + doesNotInclude(abc, asc); + + includes(asw, asc); + doesNotInclude(asc, asw); + } + + void testStarWildcards() + { + Node sigma = d_nm->mkNode(REGEXP_SIGMA, std::vector{}); + Node sigmaStar = d_nm->mkNode(REGEXP_STAR, sigma); + Node a = d_nm->mkNode(STRING_TO_REGEXP, d_nm->mkConst(String("a"))); + Node c = d_nm->mkNode(STRING_TO_REGEXP, d_nm->mkConst(String("c"))); + Node abc = d_nm->mkNode(STRING_TO_REGEXP, d_nm->mkConst(String("abc"))); + + Node _abc_ = d_nm->mkNode(REGEXP_CONCAT, sigmaStar, abc, sigmaStar); + Node _asc_ = d_nm->mkNode(REGEXP_CONCAT, sigmaStar, a, sigma, c, sigmaStar); + Node _sc_ = Rewriter::rewrite( + d_nm->mkNode(REGEXP_CONCAT, sigmaStar, sigma, c, sigmaStar)); + Node _as_ = Rewriter::rewrite( + d_nm->mkNode(REGEXP_CONCAT, sigmaStar, a, sigma, sigmaStar)); + Node _assc_ = d_nm->mkNode( + REGEXP_CONCAT, + std::vector{sigmaStar, a, sigma, sigma, c, sigmaStar}); + Node _csa_ = d_nm->mkNode(REGEXP_CONCAT, sigmaStar, c, sigma, a, sigmaStar); + Node _c_a_ = + d_nm->mkNode(REGEXP_CONCAT, sigmaStar, c, sigmaStar, a, sigmaStar); + Node _s_s_ = Rewriter::rewrite(d_nm->mkNode( + REGEXP_CONCAT, sigmaStar, sigma, sigmaStar, sigma, sigmaStar)); + Node _a_abc_ = Rewriter::rewrite( + d_nm->mkNode(REGEXP_CONCAT, sigmaStar, a, sigmaStar, abc, sigmaStar)); + + includes(_asc_, _abc_); + doesNotInclude(_abc_, _asc_); + doesNotInclude(_csa_, _abc_); + doesNotInclude(_assc_, _asc_); + doesNotInclude(_asc_, _assc_); + includes(_sc_, abc); + includes(_sc_, _abc_); + includes(_sc_, _asc_); + includes(_sc_, _assc_); + doesNotInclude(_sc_, _csa_); + includes(_as_, abc); + includes(_as_, _abc_); + includes(_as_, _asc_); + includes(_as_, _assc_); + doesNotInclude(_sc_, _csa_); + includes(_s_s_, _c_a_); + doesNotInclude(_c_a_, _s_s_); + includes(_abc_, _a_abc_); + doesNotInclude(_a_abc_, _abc_); + } + + private: + ExprManager* d_em; + SmtEngine* d_smt; + SmtScope* d_scope; + RegExpOpr* d_regExpOpr; + + NodeManager* d_nm; +};