From eacb636406e609299b6e5b64e93f1cf5b73f4ba3 Mon Sep 17 00:00:00 2001
From: Andrew Reynolds <andrew.j.reynolds@gmail.com>
Date: Wed, 15 Apr 2020 09:00:47 -0500
Subject: [PATCH] Move regular expression inclusion test to RegExpEntail
 (#4310)

In preparation for rephrasing this inference as a rewrite.
---
 src/theory/strings/regexp_entail.cpp    | 103 ++++++++++++++++++++++++
 src/theory/strings/regexp_entail.h      |  15 ++++
 src/theory/strings/regexp_operation.cpp |  99 +----------------------
 src/theory/strings/regexp_operation.h   |  12 +--
 4 files changed, 122 insertions(+), 107 deletions(-)
diff --git a/src/theory/strings/regexp_entail.cpp b/src/theory/strings/regexp_entail.cpp
index a43ec4430..8830b7f93 100644
--- a/src/theory/strings/regexp_entail.cpp
+++ b/src/theory/strings/regexp_entail.cpp
@@ -620,6 +620,109 @@ Node RegExpEntail::getFixedLengthForRegexp(Node n)
   return Node::null();
 }
 
+bool RegExpEntail::regExpIncludes(Node r1, Node r2)
+{
+  Assert(Rewriter::rewrite(r1) == r1);
+  Assert(Rewriter::rewrite(r2) == r2);
+  if (r1 == r2)
+  {
+    return true;
+  }
+
+  // This method only works on a fragment of regular expressions
+  if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2))
+  {
+    return false;
+  }
+  NodeManager* nm = NodeManager::currentNM();
+  Node sigma = nm->mkNode(REGEXP_SIGMA, std::vector<Node>{});
+  Node sigmaStar = nm->mkNode(REGEXP_STAR, sigma);
+
+  std::vector<Node> v1, v2;
+  utils::getRegexpComponents(r1, v1);
+  utils::getRegexpComponents(r2, v2);
+
+  // In the following, we iterate over `r2` (the "includee") and try to
+  // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible
+  // positions in `r1` that we could currently be at.
+  std::unordered_set<size_t> newIdxs = {0};
+  std::unordered_set<size_t> idxs;
+  for (const Node& n2 : v2)
+  {
+    // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are
+    // removed and for (re.* re.allchar), we additionally include the option of
+    // skipping it. Indices must be smaller than the size of `v1`.
+    idxs.clear();
+    for (size_t idx : newIdxs)
+    {
+      if (idx < v1.size())
+      {
+        idxs.insert(idx);
+        if (idx + 1 < v1.size() && v1[idx] == sigmaStar)
+        {
+          Assert(idx + 1 == v1.size() || v1[idx + 1] != sigmaStar);
+          idxs.insert(idx + 1);
+        }
+      }
+    }
+    newIdxs.clear();
+
+    if (n2.getKind() == STRING_TO_REGEXP || n2 == sigma)
+    {
+      Assert(n2 == sigma
+             || (n2[0].isConst() && n2[0].getConst<String>().size() == 1));
+      for (size_t idx : idxs)
+      {
+        if (v1[idx] == sigma || v1[idx] == n2)
+        {
+          // Given a character or an re.allchar in `r2`, we can either
+          // match it with a corresponding character in `r1` or an
+          // re.allchar
+          newIdxs.insert(idx + 1);
+        }
+      }
+    }
+
+    for (size_t idx : idxs)
+    {
+      if (v1[idx] == sigmaStar)
+      {
+        // (re.* re.allchar) can match an arbitrary amount of `r2`
+        newIdxs.insert(idx);
+      }
+      else if (utils::isUnboundedWildcard(v1, idx))
+      {
+        // If a series of re.allchar is followed by (re.* re.allchar), we
+        // can decide not to "waste" the re.allchar because the order of
+        // the two wildcards is not observable (i.e. it does not change
+        // the sequences matched by the regular expression)
+        newIdxs.insert(idx);
+      }
+    }
+
+    if (newIdxs.empty())
+    {
+      // If there are no candidates, we can't match the remainder of r2
+      return false;
+    }
+  }
+
+  // We have processed all of `r2`. We are now looking if there was also a
+  // path to the end in `r1`. This makes sure that we don't have leftover
+  // bits in `r1` that don't match anything in `r2`.
+  bool result = false;
+  for (size_t idx : newIdxs)
+  {
+    if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == sigmaStar))
+    {
+      result = true;
+      break;
+    }
+  }
+
+  return result;
+}
+
 }  // namespace strings
 }  // namespace theory
 }  // namespace CVC4
diff --git a/src/theory/strings/regexp_entail.h b/src/theory/strings/regexp_entail.h
index 0732a6770..ef3f2d34e 100644
--- a/src/theory/strings/regexp_entail.h
+++ b/src/theory/strings/regexp_entail.h
@@ -108,6 +108,21 @@ class RegExpEntail
    * x in n entails len( x ) = c.
    */
   static Node getFixedLengthForRegexp(Node n);
+
+  /**
+   * Returns true if we can show that the regular expression `r1` includes
+   * the regular expression `r2` (i.e. `r1` matches a superset of sequences
+   * that `r2` matches). This method only works on a fragment of regular
+   * expressions, specifically regular expressions that pass the
+   * `isSimpleRegExp` check.
+   *
+   * @param r1 The regular expression that may include `r2` (must be in
+   *           rewritten form)
+   * @param r2 The regular expression that may be included by `r1` (must be
+   *           in rewritten form)
+   * @return True if the inclusion can be shown, false otherwise
+   */
+  static bool regExpIncludes(Node r1, Node r2);
 };
 
 }  // namespace strings
diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp
index 0d6b6c7fe..b99124ac3 100644
--- a/src/theory/strings/regexp_operation.cpp
+++ b/src/theory/strings/regexp_operation.cpp
@@ -1750,109 +1750,12 @@ std::string RegExpOpr::mkString( Node r ) {
 
 bool RegExpOpr::regExpIncludes(Node r1, Node r2)
 {
-  Assert(Rewriter::rewrite(r1) == r1);
-  Assert(Rewriter::rewrite(r2) == r2);
-
-  if (r1 == r2)
-  {
-    return true;
-  }
-
-  // This method only works on a fragment of regular expressions
-  if (!utils::isSimpleRegExp(r1) || !utils::isSimpleRegExp(r2))
-  {
-    return false;
-  }
-
   const auto& it = d_inclusionCache.find(std::make_pair(r1, r2));
   if (it != d_inclusionCache.end())
   {
     return (*it).second;
   }
-
-  std::vector<Node> v1, v2;
-  utils::getRegexpComponents(r1, v1);
-  utils::getRegexpComponents(r2, v2);
-
-  // In the following, we iterate over `r2` (the "includee") and try to
-  // match it with `r1`. `idxs`/`newIdxs` keep track of all the possible
-  // positions in `r1` that we could currently be at.
-  std::unordered_set<size_t> newIdxs = {0};
-  std::unordered_set<size_t> idxs;
-  for (const Node& n2 : v2)
-  {
-    // Transfer elements from `newIdxs` to `idxs`. Out-of-bound indices are
-    // removed and for (re.* re.allchar), we additionally include the option of
-    // skipping it. Indices must be smaller than the size of `v1`.
-    idxs.clear();
-    for (size_t idx : newIdxs)
-    {
-      if (idx < v1.size())
-      {
-        idxs.insert(idx);
-        if (idx + 1 < v1.size() && v1[idx] == d_sigma_star)
-        {
-          Assert(idx + 1 == v1.size() || v1[idx + 1] != d_sigma_star);
-          idxs.insert(idx + 1);
-        }
-      }
-    }
-    newIdxs.clear();
-
-    if (n2.getKind() == STRING_TO_REGEXP || n2 == d_sigma)
-    {
-      Assert(n2 == d_sigma
-             || (n2[0].isConst() && n2[0].getConst<String>().size() == 1));
-      for (size_t idx : idxs)
-      {
-        if (v1[idx] == d_sigma || v1[idx] == n2)
-        {
-          // Given a character or an re.allchar in `r2`, we can either
-          // match it with a corresponding character in `r1` or an
-          // re.allchar
-          newIdxs.insert(idx + 1);
-        }
-      }
-    }
-
-    for (size_t idx : idxs)
-    {
-      if (v1[idx] == d_sigma_star)
-      {
-        // (re.* re.allchar) can match an arbitrary amount of `r2`
-        newIdxs.insert(idx);
-      }
-      else if (utils::isUnboundedWildcard(v1, idx))
-      {
-        // If a series of re.allchar is followed by (re.* re.allchar), we
-        // can decide not to "waste" the re.allchar because the order of
-        // the two wildcards is not observable (i.e. it does not change
-        // the sequences matched by the regular expression)
-        newIdxs.insert(idx);
-      }
-    }
-
-    if (newIdxs.empty())
-    {
-      // If there are no candidates, we can't match the remainder of r2
-      d_inclusionCache[std::make_pair(r1, r2)] = false;
-      return false;
-    }
-  }
-
-  // We have processed all of `r2`. We are now looking if there was also a
-  // path to the end in `r1`. This makes sure that we don't have leftover
-  // bits in `r1` that don't match anything in `r2`.
-  bool result = false;
-  for (size_t idx : newIdxs)
-  {
-    if (idx == v1.size() || (idx == v1.size() - 1 && v1[idx] == d_sigma_star))
-    {
-      result = true;
-      break;
-    }
-  }
-
+  bool result = RegExpEntail::regExpIncludes(r1, r2);
   d_inclusionCache[std::make_pair(r1, r2)] = result;
   return result;
 }
diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h
index 53d60cd40..689c35f3d 100644
--- a/src/theory/strings/regexp_operation.h
+++ b/src/theory/strings/regexp_operation.h
@@ -150,15 +150,9 @@ class RegExpOpr {
   /**
    * Returns true if we can show that the regular expression `r1` includes
    * the regular expression `r2` (i.e. `r1` matches a superset of sequences
-   * that `r2` matches). This method only works on a fragment of regular
-   * expressions, specifically regular expressions that pass the
-   * `isSimpleRegExp` check.
-   *
-   * @param r1 The regular expression that may include `r2` (must be in
-   *           rewritten form)
-   * @param r2 The regular expression that may be included by `r1` (must be
-   *           in rewritten form)
-   * @return True if the inclusion can be shown, false otherwise
+   * that `r2` matches). See documentation in RegExpEntail::regExpIncludes for
+   * more details. This call caches the result (which is context-independent),
+   * for performance reasons.
    */
   bool regExpIncludes(Node r1, Node r2);
 };
-- 
2.30.2