From eacb636406e609299b6e5b64e93f1cf5b73f4ba3 Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Wed, 15 Apr 2020 09:00:47 -0500 Subject: [PATCH] Move regular expression inclusion test to RegExpEntail (#4310) In preparation for rephrasing this inference as a rewrite. --- src/theory/strings/regexp_entail.cpp | 103 ++++++++++++++++++++++++ src/theory/strings/regexp_entail.h | 15 ++++ src/theory/strings/regexp_operation.cpp | 99 +---------------------- src/theory/strings/regexp_operation.h | 12 +-- 4 files changed, 122 insertions(+), 107 deletions(-) diff --git a/src/theory/strings/regexp_entail.cpp b/src/theory/strings/regexp_entail.cpp index a43ec4430..8830b7f93 100644 --- a/src/theory/strings/regexp_entail.cpp +++ b/src/theory/strings/regexp_entail.cpp @@ -620,6 +620,109 @@ Node RegExpEntail::getFixedLengthForRegexp(Node n) return Node::null(); } +bool RegExpEntail::regExpIncludes(Node r1, Node r2) +{ + Assert(Rewriter::rewrite(r1) == r1); + Assert(Rewriter::rewrite(r2) == r2); + if (r1 == r2) + { + return true; + } + + // This method only works on a fragment of regular expressions + if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2)) + { + return false; + } + NodeManager* nm = NodeManager::currentNM(); + Node sigma = nm->mkNode(REGEXP_SIGMA, std::vector{}); + Node sigmaStar = nm->mkNode(REGEXP_STAR, sigma); + + std::vector v1, v2; + utils::getRegexpComponents(r1, v1); + utils::getRegexpComponents(r2, v2); + + // In the following, we iterate over `r2` (the "includee") and try to + // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible + // positions in `r1` that we could currently be at. + std::unordered_set newIdxs = {0}; + std::unordered_set idxs; + for (const Node& n2 : v2) + { + // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are + // removed and for (re.* re.allchar), we additionally include the option of + // skipping it. Indices must be smaller than the size of `v1`. + idxs.clear(); + for (size_t idx : newIdxs) + { + if (idx < v1.size()) + { + idxs.insert(idx); + if (idx + 1 < v1.size() && v1[idx] == sigmaStar) + { + Assert(idx + 1 == v1.size() || v1[idx + 1] != sigmaStar); + idxs.insert(idx + 1); + } + } + } + newIdxs.clear(); + + if (n2.getKind() == STRING_TO_REGEXP || n2 == sigma) + { + Assert(n2 == sigma + || (n2[0].isConst() && n2[0].getConst().size() == 1)); + for (size_t idx : idxs) + { + if (v1[idx] == sigma || v1[idx] == n2) + { + // Given a character or an re.allchar in `r2`, we can either + // match it with a corresponding character in `r1` or an + // re.allchar + newIdxs.insert(idx + 1); + } + } + } + + for (size_t idx : idxs) + { + if (v1[idx] == sigmaStar) + { + // (re.* re.allchar) can match an arbitrary amount of `r2` + newIdxs.insert(idx); + } + else if (utils::isUnboundedWildcard(v1, idx)) + { + // If a series of re.allchar is followed by (re.* re.allchar), we + // can decide not to "waste" the re.allchar because the order of + // the two wildcards is not observable (i.e. it does not change + // the sequences matched by the regular expression) + newIdxs.insert(idx); + } + } + + if (newIdxs.empty()) + { + // If there are no candidates, we can't match the remainder of r2 + return false; + } + } + + // We have processed all of `r2`. We are now looking if there was also a + // path to the end in `r1`. This makes sure that we don't have leftover + // bits in `r1` that don't match anything in `r2`. + bool result = false; + for (size_t idx : newIdxs) + { + if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == sigmaStar)) + { + result = true; + break; + } + } + + return result; +} + } // namespace strings } // namespace theory } // namespace CVC4 diff --git a/src/theory/strings/regexp_entail.h b/src/theory/strings/regexp_entail.h index 0732a6770..ef3f2d34e 100644 --- a/src/theory/strings/regexp_entail.h +++ b/src/theory/strings/regexp_entail.h @@ -108,6 +108,21 @@ class RegExpEntail * x in n entails len( x ) = c. */ static Node getFixedLengthForRegexp(Node n); + + /** + * Returns true if we can show that the regular expression `r1` includes + * the regular expression `r2` (i.e. `r1` matches a superset of sequences + * that `r2` matches). This method only works on a fragment of regular + * expressions, specifically regular expressions that pass the + * `isSimpleRegExp` check. + * + * @param r1 The regular expression that may include `r2` (must be in + * rewritten form) + * @param r2 The regular expression that may be included by `r1` (must be + * in rewritten form) + * @return True if the inclusion can be shown, false otherwise + */ + static bool regExpIncludes(Node r1, Node r2); }; } // namespace strings diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp index 0d6b6c7fe..b99124ac3 100644 --- a/src/theory/strings/regexp_operation.cpp +++ b/src/theory/strings/regexp_operation.cpp @@ -1750,109 +1750,12 @@ std::string RegExpOpr::mkString( Node r ) { bool RegExpOpr::regExpIncludes(Node r1, Node r2) { - Assert(Rewriter::rewrite(r1) == r1); - Assert(Rewriter::rewrite(r2) == r2); - - if (r1 == r2) - { - return true; - } - - // This method only works on a fragment of regular expressions - if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2)) - { - return false; - } - const auto& it = d_inclusionCache.find(std::make_pair(r1, r2)); if (it != d_inclusionCache.end()) { return (*it).second; } - - std::vector v1, v2; - utils::getRegexpComponents(r1, v1); - utils::getRegexpComponents(r2, v2); - - // In the following, we iterate over `r2` (the "includee") and try to - // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible - // positions in `r1` that we could currently be at. - std::unordered_set newIdxs = {0}; - std::unordered_set idxs; - for (const Node& n2 : v2) - { - // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are - // removed and for (re.* re.allchar), we additionally include the option of - // skipping it. Indices must be smaller than the size of `v1`. - idxs.clear(); - for (size_t idx : newIdxs) - { - if (idx < v1.size()) - { - idxs.insert(idx); - if (idx + 1 < v1.size() && v1[idx] == d_sigma_star) - { - Assert(idx + 1 == v1.size() || v1[idx + 1] != d_sigma_star); - idxs.insert(idx + 1); - } - } - } - newIdxs.clear(); - - if (n2.getKind() == STRING_TO_REGEXP || n2 == d_sigma) - { - Assert(n2 == d_sigma - || (n2[0].isConst() && n2[0].getConst().size() == 1)); - for (size_t idx : idxs) - { - if (v1[idx] == d_sigma || v1[idx] == n2) - { - // Given a character or an re.allchar in `r2`, we can either - // match it with a corresponding character in `r1` or an - // re.allchar - newIdxs.insert(idx + 1); - } - } - } - - for (size_t idx : idxs) - { - if (v1[idx] == d_sigma_star) - { - // (re.* re.allchar) can match an arbitrary amount of `r2` - newIdxs.insert(idx); - } - else if (utils::isUnboundedWildcard(v1, idx)) - { - // If a series of re.allchar is followed by (re.* re.allchar), we - // can decide not to "waste" the re.allchar because the order of - // the two wildcards is not observable (i.e. it does not change - // the sequences matched by the regular expression) - newIdxs.insert(idx); - } - } - - if (newIdxs.empty()) - { - // If there are no candidates, we can't match the remainder of r2 - d_inclusionCache[std::make_pair(r1, r2)] = false; - return false; - } - } - - // We have processed all of `r2`. We are now looking if there was also a - // path to the end in `r1`. This makes sure that we don't have leftover - // bits in `r1` that don't match anything in `r2`. - bool result = false; - for (size_t idx : newIdxs) - { - if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == d_sigma_star)) - { - result = true; - break; - } - } - + bool result = RegExpEntail::regExpIncludes(r1, r2); d_inclusionCache[std::make_pair(r1, r2)] = result; return result; } diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h index 53d60cd40..689c35f3d 100644 --- a/src/theory/strings/regexp_operation.h +++ b/src/theory/strings/regexp_operation.h @@ -150,15 +150,9 @@ class RegExpOpr { /** * Returns true if we can show that the regular expression `r1` includes * the regular expression `r2` (i.e. `r1` matches a superset of sequences - * that `r2` matches). This method only works on a fragment of regular - * expressions, specifically regular expressions that pass the - * `isSimpleRegExp` check. - * - * @param r1 The regular expression that may include `r2` (must be in - * rewritten form) - * @param r2 The regular expression that may be included by `r1` (must be - * in rewritten form) - * @return True if the inclusion can be shown, false otherwise + * that `r2` matches). See documentation in RegExpEntail::regExpIncludes for + * more details. This call caches the result (which is context-independent), + * for performance reasons. */ bool regExpIncludes(Node r1, Node r2); }; -- 2.30.2