From b0fa6b29a1e15b231547eab147b49f2883a139de Mon Sep 17 00:00:00 2001
From: Andrew Reynolds <andrew.j.reynolds@gmail.com>
Date: Mon, 24 Feb 2020 16:21:45 -0600
Subject: [PATCH] Utilities for words (#3797)

This adds a utility file for handling operations over constant "words" (this will eventually generalize string or sequence constants). This is work towards CVC4/cvc4-projects#23.

Also improves documentation in regexp.h.
---
 src/CMakeLists.txt                           |   2 +
 src/theory/strings/word.cpp                  | 227 +++++++++++++++++++
 src/theory/strings/word.h                    | 121 ++++++++++
 src/util/regexp.cpp                          |   7 +
 src/util/regexp.h                            |  25 +-
 test/unit/theory/CMakeLists.txt              |   1 +
 test/unit/theory/theory_strings_word_white.h | 149 ++++++++++++
 7 files changed, 523 insertions(+), 9 deletions(-)
 create mode 100644 src/theory/strings/word.cpp
 create mode 100644 src/theory/strings/word.h
 create mode 100644 test/unit/theory/theory_strings_word_white.h
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 45912a30d..9a81ccc63 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -702,6 +702,8 @@ libcvc4_add_sources(
   theory/strings/theory_strings_utils.cpp
   theory/strings/theory_strings_utils.h
   theory/strings/type_enumerator.h
+  theory/strings/word.cpp
+  theory/strings/word.h
   theory/subs_minimize.cpp
   theory/subs_minimize.h
   theory/substitutions.cpp
diff --git a/src/theory/strings/word.cpp b/src/theory/strings/word.cpp
new file mode 100644
index 000000000..655df7da3
--- /dev/null
+++ b/src/theory/strings/word.cpp
@@ -0,0 +1,227 @@
+/*********************                                                        */
+/*! \file word.cpp
+ ** \verbatim
+ ** Top contributors (to current version):
+ **   Andrew Reynolds
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
+ ** in the top-level source directory) and their institutional affiliations.
+ ** All rights reserved.  See the file COPYING in the top-level source
+ ** directory for licensing information.\endverbatim
+ **
+ ** \brief Implementation of utility functions for words.
+ **/
+
+#include "theory/strings/word.h"
+
+#include "util/regexp.h"
+
+using namespace CVC4::kind;
+
+namespace CVC4 {
+namespace theory {
+namespace strings {
+
+Node Word::mkEmptyWord(Kind k)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  if (k == CONST_STRING)
+  {
+    std::vector<unsigned> vec;
+    return nm->mkConst(String(vec));
+  }
+  Unimplemented();
+  return Node::null();
+}
+
+Node Word::mkWord(const std::vector<Node>& xs)
+{
+  Assert(!xs.empty());
+  NodeManager* nm = NodeManager::currentNM();
+  Kind k = xs[0].getKind();
+  if (k == CONST_STRING)
+  {
+    std::vector<unsigned> vec;
+    for (TNode x : xs)
+    {
+      Assert(x.getKind() == CONST_STRING);
+      String sx = x.getConst<String>();
+      const std::vector<unsigned>& vecc = sx.getVec();
+      vec.insert(vec.end(), vecc.begin(), vecc.end());
+    }
+    return nm->mkConst(String(vec));
+  }
+  Unimplemented();
+  return Node::null();
+}
+
+size_t Word::getLength(TNode x)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    return x.getConst<String>().size();
+  }
+  Unimplemented();
+  return 0;
+}
+
+bool Word::isEmpty(TNode x) { return getLength(x) == 0; }
+
+std::size_t Word::find(TNode x, TNode y, std::size_t start)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.find(sy, start);
+  }
+  Unimplemented();
+  return 0;
+}
+
+std::size_t Word::rfind(TNode x, TNode y, std::size_t start)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.rfind(sy, start);
+  }
+  Unimplemented();
+  return 0;
+}
+
+bool Word::hasPrefix(TNode x, TNode y)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.hasPrefix(sy);
+  }
+  Unimplemented();
+  return false;
+}
+
+bool Word::hasSuffix(TNode x, TNode y)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.hasSuffix(sy);
+  }
+  Unimplemented();
+  return false;
+}
+
+Node Word::replace(TNode x, TNode y, TNode t)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    Assert(t.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    String st = t.getConst<String>();
+    return nm->mkConst(String(sx.replace(sy, st)));
+  }
+  Unimplemented();
+  return Node::null();
+}
+Node Word::substr(TNode x, std::size_t i)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    String sx = x.getConst<String>();
+    return nm->mkConst(String(sx.substr(i)));
+  }
+  Unimplemented();
+  return Node::null();
+}
+Node Word::substr(TNode x, std::size_t i, std::size_t j)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    String sx = x.getConst<String>();
+    return nm->mkConst(String(sx.substr(i, j)));
+  }
+  Unimplemented();
+  return Node::null();
+}
+
+Node Word::prefix(TNode x, std::size_t i) { return substr(x, 0, i); }
+
+Node Word::suffix(TNode x, std::size_t i)
+{
+  NodeManager* nm = NodeManager::currentNM();
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    String sx = x.getConst<String>();
+    return nm->mkConst(String(sx.suffix(i)));
+  }
+  Unimplemented();
+  return Node::null();
+}
+
+bool Word::noOverlapWith(TNode x, TNode y)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.noOverlapWith(sy);
+  }
+  Unimplemented();
+  return false;
+}
+
+std::size_t Word::overlap(TNode x, TNode y)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.overlap(sy);
+  }
+  Unimplemented();
+  return 0;
+}
+
+std::size_t Word::roverlap(TNode x, TNode y)
+{
+  Kind k = x.getKind();
+  if (k == CONST_STRING)
+  {
+    Assert(y.getKind() == CONST_STRING);
+    String sx = x.getConst<String>();
+    String sy = y.getConst<String>();
+    return sx.roverlap(sy);
+  }
+  Unimplemented();
+  return 0;
+}
+
+}  // namespace strings
+}  // namespace theory
+}  // namespace CVC4
diff --git a/src/theory/strings/word.h b/src/theory/strings/word.h
new file mode 100644
index 000000000..e4d10247a
--- /dev/null
+++ b/src/theory/strings/word.h
@@ -0,0 +1,121 @@
+/*********************                                                        */
+/*! \file word.h
+ ** \verbatim
+ ** Top contributors (to current version):
+ **   Andrew Reynolds
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
+ ** in the top-level source directory) and their institutional affiliations.
+ ** All rights reserved.  See the file COPYING in the top-level source
+ ** directory for licensing information.\endverbatim
+ **
+ ** \brief Utility functions for words.
+ **/
+
+#include "cvc4_private.h"
+
+#ifndef CVC4__THEORY__STRINGS__WORD_H
+#define CVC4__THEORY__STRINGS__WORD_H
+
+#include <vector>
+
+#include "expr/node.h"
+
+namespace CVC4 {
+namespace theory {
+namespace strings {
+
+// ------------------------------ for words (string or sequence constants)
+class Word
+{
+public:
+  /** make empty constant of kind k */
+  static Node mkEmptyWord(Kind k);
+
+  /** make word from constants in (non-empty) vector vec */
+  static Node mkWord(const std::vector<Node>& xs);
+
+  /** Return the length of word x */
+  static size_t getLength(TNode x);
+
+  /** Return true if x is empty */
+  static bool isEmpty(TNode x);
+
+  /** Return the first position y occurs in x, or std::string::npos otherwise */
+  static std::size_t find(TNode x, TNode y, std::size_t start = 0);
+
+  /**
+  * Return the first position y occurs in x searching from the end of x, or
+  * std::string::npos otherwise
+  */
+  static std::size_t rfind(TNode x, TNode y, std::size_t start = 0);
+
+  /** Returns true if y is a prefix of x */
+  static bool hasPrefix(TNode x, TNode y);
+
+  /** Returns true if y is a suffix of x */
+  static bool hasSuffix(TNode x, TNode y);
+
+  /** Replace the first occurrence of y in x with t */
+  static Node replace(TNode x, TNode y, TNode t);
+
+  /** Return the substring/subsequence of x starting at index i */
+  static Node substr(TNode x, std::size_t i);
+
+  /** Return the substring/subsequence of x starting at index i with size at most
+  * j */
+  static Node substr(TNode x, std::size_t i, std::size_t j);
+
+  /** Return the prefix of x of size at most i */
+  static Node prefix(TNode x, std::size_t i);
+
+  /** Return the suffix of x of size at most i */
+  static Node suffix(TNode x, std::size_t i);
+
+  /**
+  * Checks if there is any overlap between word x and another word y. This
+  * corresponds to checking whether one string contains the other and whether a
+  * substring/subsequence of one is a prefix of the other and vice-versa.
+  *
+  * @param x The first string
+  * @param y The second string
+  * @return True if there is an overlap, false otherwise
+  */
+  static bool noOverlapWith(TNode x, TNode y);
+
+  /** overlap
+  *
+  * if overlap returns m>0,
+  * then the maximal suffix of this string that is a prefix of y is of length m.
+  *
+  * For example, if x is "abcdef", then:
+  * x.overlap("defg") = 3
+  * x.overlap("ab") = 0
+  * x.overlap("d") = 0
+  * x.overlap("bcdefdef") = 5
+  */
+  static std::size_t overlap(TNode x, TNode y);
+
+  /** reverse overlap
+  *
+  * if roverlap returns m>0,
+  * then the maximal prefix of this word that is a suffix of y is of length m.
+  *
+  * For example, if x is "abcdef", then:
+  * x.roverlap("aaabc") = 3
+  * x.roverlap("def") = 0
+  * x.roverlap("d") = 0
+  * x.roverlap("defabcde") = 5
+  *
+  * Notice that x.overlap(y) = y.roverlap(x)
+  */
+  static std::size_t roverlap(TNode x, TNode y);
+};
+
+// ------------------------------ end for words (string or sequence constants)
+
+}  // namespace strings
+}  // namespace theory
+}  // namespace CVC4
+
+#endif
diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp
index ee63308ab..4dfe68b51 100644
--- a/src/util/regexp.cpp
+++ b/src/util/regexp.cpp
@@ -462,6 +462,13 @@ String String::substr(std::size_t i, std::size_t j) const {
   return String(ret_vec);
 }
 
+bool String::noOverlapWith(const String& y) const
+{
+  return y.find(*this) == std::string::npos
+         && this->find(y) == std::string::npos && this->overlap(y) == 0
+         && y.overlap(*this) == 0;
+}
+
 bool String::isNumber() const {
   if (d_str.empty()) {
     return false;
diff --git a/src/util/regexp.h b/src/util/regexp.h
index 5b6dc2b62..1cde53687 100644
--- a/src/util/regexp.h
+++ b/src/util/regexp.h
@@ -140,34 +140,41 @@ class CVC4_PUBLIC String {
   bool isRepeated() const;
   bool tailcmp(const String& y, int& c) const;
 
+  /**
+   * Return the first position y occurs in this string, or std::string::npos
+   * otherwise.
+   */
   std::size_t find(const String& y, const std::size_t start = 0) const;
+  /**
+   * Return the first position y occurs in this string searching from the end,
+   * or std::string::npos otherwise.
+   */
   std::size_t rfind(const String& y, const std::size_t start = 0) const;
   /** Returns true if y is a prefix of this */
   bool hasPrefix(const String& y) const;
   /** Returns true if y is a suffix of this */
   bool hasSuffix(const String& y) const;
-
+  /** Replace the first occurrence of s in this string with t */
   String replace(const String& s, const String& t) const;
+  /** Return the substring of this string starting at index i */
   String substr(std::size_t i) const;
+  /** Return the substring of this string starting at index i with size at most
+   * j */
   String substr(std::size_t i, std::size_t j) const;
-
+  /** Return the prefix of this string of size at most i */
   String prefix(std::size_t i) const { return substr(0, i); }
+  /** Return the suffix of this string of size at most i */
   String suffix(std::size_t i) const { return substr(size() - i, i); }
 
   /**
    * Checks if there is any overlap between this string and another string. This
-   * corresponds to checking whether one string contains the other and wether a
+   * corresponds to checking whether one string contains the other and whether a
    * substring of one is a prefix of the other and vice-versa.
    *
    * @param y The other string
    * @return True if there is an overlap, false otherwise
    */
-  bool noOverlapWith(const String& y) const
-  {
-    return y.find(*this) == std::string::npos
-           && this->find(y) == std::string::npos && this->overlap(y) == 0
-           && y.overlap(*this) == 0;
-  }
+  bool noOverlapWith(const String& y) const;
 
   /** string overlap
   *
diff --git a/test/unit/theory/CMakeLists.txt b/test/unit/theory/CMakeLists.txt
index 02ca5d8b5..f20088ad1 100644
--- a/test/unit/theory/CMakeLists.txt
+++ b/test/unit/theory/CMakeLists.txt
@@ -10,5 +10,6 @@ cvc4_add_unit_test_white(theory_quantifiers_bv_instantiator_white theory)
 cvc4_add_unit_test_white(theory_quantifiers_bv_inverter_white theory)
 cvc4_add_unit_test_white(theory_strings_rewriter_white theory)
 cvc4_add_unit_test_white(theory_strings_skolem_cache_black theory)
+cvc4_add_unit_test_white(theory_strings_word_white theory)
 cvc4_add_unit_test_white(theory_white theory)
 cvc4_add_unit_test_white(type_enumerator_white theory)
diff --git a/test/unit/theory/theory_strings_word_white.h b/test/unit/theory/theory_strings_word_white.h
new file mode 100644
index 000000000..d84df7836
--- /dev/null
+++ b/test/unit/theory/theory_strings_word_white.h
@@ -0,0 +1,149 @@
+/*********************                                                        */
+/*! \file theory_strings_word_white.h
+ ** \verbatim
+ ** Top contributors (to current version):
+ **   Andrew Reynolds
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
+ ** in the top-level source directory) and their institutional affiliations.
+ ** All rights reserved.  See the file COPYING in the top-level source
+ ** directory for licensing information.\endverbatim
+ **
+ ** \brief Unit tests for the strings word utilities
+ **/
+
+#include "expr/node.h"
+#include "expr/node_manager.h"
+#include "smt/smt_engine.h"
+#include "smt/smt_engine_scope.h"
+#include "theory/strings/word.h"
+
+#include <cxxtest/TestSuite.h>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+using namespace CVC4;
+using namespace CVC4::smt;
+using namespace CVC4::theory;
+using namespace CVC4::theory::strings;
+
+class TheoryStringsWordWhite : public CxxTest::TestSuite
+{
+ public:
+  TheoryStringsWordWhite() {}
+
+  void setUp() override
+  {
+    Options opts;
+    opts.setOutputLanguage(language::output::LANG_SMTLIB_V2);
+    d_em = new ExprManager(opts);
+    d_smt = new SmtEngine(d_em);
+    d_scope = new SmtScope(d_smt);
+
+    d_nm = NodeManager::currentNM();
+  }
+
+  void tearDown() override
+  {
+    delete d_scope;
+    delete d_smt;
+    delete d_em;
+  }
+
+  void testStrings()
+  {
+    Node empty = d_nm->mkConst(String(""));
+    Node a = d_nm->mkConst(String("a"));
+    Node b = d_nm->mkConst(String("b"));
+    Node aa = d_nm->mkConst(String("aa"));
+    Node aaaaa = d_nm->mkConst(String("aaaaa"));
+    Node abc = d_nm->mkConst(String("abc"));
+    Node bbc = d_nm->mkConst(String("bbc"));
+    Node cac = d_nm->mkConst(String("cac"));
+    Node abca = d_nm->mkConst(String("abca"));
+
+    TS_ASSERT(Word::mkEmptyWord(kind::CONST_STRING) == empty);
+
+    std::vector<Node> vec;
+    vec.push_back(abc);
+    Node abcMk = Word::mkWord(vec);
+    TS_ASSERT_EQUALS(abc, abcMk);
+    vec.push_back(a);
+    Node abcaMk = Word::mkWord(vec);
+    TS_ASSERT_EQUALS(abca, abcaMk);
+
+    TS_ASSERT(Word::getLength(empty) == 0);
+    TS_ASSERT(Word::getLength(aaaaa) == 5);
+
+    TS_ASSERT(Word::isEmpty(empty));
+    TS_ASSERT(!Word::isEmpty(a));
+
+    TS_ASSERT(Word::find(empty, empty) == 0);
+    TS_ASSERT(Word::find(a, empty) == 0);
+    TS_ASSERT(Word::find(empty, empty, 1) == std::string::npos);
+    TS_ASSERT(Word::find(cac, a, 0) == 1);
+    TS_ASSERT(Word::find(cac, abc) == std::string::npos);
+
+    TS_ASSERT(Word::rfind(aaaaa, empty) == 0);
+    TS_ASSERT(Word::rfind(aaaaa, a) == 0);
+    TS_ASSERT(Word::rfind(abca, abc, 1) == 1);
+
+    TS_ASSERT(Word::hasPrefix(empty, empty));
+    TS_ASSERT(Word::hasPrefix(a, empty));
+    TS_ASSERT(Word::hasPrefix(aaaaa, a));
+    TS_ASSERT(!Word::hasPrefix(abca, b));
+    TS_ASSERT(!Word::hasPrefix(empty, a));
+
+    TS_ASSERT(Word::hasSuffix(empty, empty));
+    TS_ASSERT(Word::hasSuffix(a, empty));
+    TS_ASSERT(Word::hasSuffix(a, a));
+    TS_ASSERT(!Word::hasSuffix(abca, b));
+    TS_ASSERT(!Word::hasSuffix(empty, abc));
+
+    TS_ASSERT_EQUALS(bbc, Word::replace(abc, a, b));
+    TS_ASSERT_EQUALS(aaaaa, Word::replace(aaaaa, b, a));
+    TS_ASSERT_EQUALS(aa, Word::replace(a, empty, a));
+
+    TS_ASSERT_EQUALS(empty, Word::substr(a, 1));
+    TS_ASSERT_EQUALS(empty, Word::substr(empty, 0));
+    TS_ASSERT_EQUALS(a, Word::substr(abca, 3));
+
+    TS_ASSERT_EQUALS(a, Word::substr(abc, 0, 1));
+    TS_ASSERT_EQUALS(aa, Word::substr(aaaaa, 3, 2));
+
+    TS_ASSERT_EQUALS(a, Word::prefix(a, 1));
+    TS_ASSERT_EQUALS(empty, Word::prefix(empty, 0));
+    TS_ASSERT_EQUALS(a, Word::prefix(aaaaa, 1));
+
+    TS_ASSERT_EQUALS(a, Word::suffix(a, 1));
+    TS_ASSERT_EQUALS(empty, Word::suffix(empty, 0));
+    TS_ASSERT_EQUALS(aa, Word::suffix(aaaaa, 2));
+
+    TS_ASSERT(!Word::noOverlapWith(abc, empty));
+    TS_ASSERT(Word::noOverlapWith(cac, aa));
+    TS_ASSERT(!Word::noOverlapWith(cac, abc));
+    TS_ASSERT(Word::noOverlapWith(cac, b));
+    TS_ASSERT(!Word::noOverlapWith(cac, a));
+    TS_ASSERT(!Word::noOverlapWith(abca, a));
+
+    TS_ASSERT(Word::overlap(abc, empty) == 0);
+    TS_ASSERT(Word::overlap(aaaaa, abc) == 1);
+    TS_ASSERT(Word::overlap(cac, abc) == 0);
+    TS_ASSERT(Word::overlap(empty, abc) == 0);
+    TS_ASSERT(Word::overlap(aaaaa, aa) == 2);
+
+    TS_ASSERT(Word::roverlap(abc, empty) == 0);
+    TS_ASSERT(Word::roverlap(aaaaa, abc) == 0);
+    TS_ASSERT(Word::roverlap(cac, abc) == 1);
+    TS_ASSERT(Word::roverlap(empty, abc) == 0);
+    TS_ASSERT(Word::roverlap(aaaaa, aa) == 2);
+  }
+
+ private:
+  ExprManager* d_em;
+  SmtEngine* d_smt;
+  SmtScope* d_scope;
+
+  NodeManager* d_nm;
+};
-- 
2.30.2