From: Andrew Reynolds <andrew.j.reynolds@gmail.com>
Date: Fri, 27 Mar 2020 19:04:44 +0000 (-0500)
Subject: Move string utility file (#4164)
X-Git-Tag: cvc5-1.0.0~3437
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4b7fc20dcce9eefdf568937f5e2c54141c4f5c5b;p=cvc5.git

Move string utility file (#4164)

Moves the string file to string.h. This is required since other required utilities will soon need to be added to regexp.h.
---

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index dd58c74ee..809b00b04 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -961,11 +961,11 @@ install(FILES
           util/proof.h
           util/rational_cln_imp.h
           util/rational_gmp_imp.h
-          util/regexp.h
           util/resource_manager.h
           util/result.h
           util/sexpr.h
           util/statistics.h
+          util/string.h
           util/tuple.h
           util/unsafe_interrupt_exception.h
           ${CMAKE_CURRENT_BINARY_DIR}/util/floatingpoint.h
diff --git a/src/cvc4.i b/src/cvc4.i
index f9f8f5743..9dcff7f8e 100644
--- a/src/cvc4.i
+++ b/src/cvc4.i
@@ -274,10 +274,10 @@ std::set<JavaInputStreamAdapter*> CVC4::JavaInputStreamAdapter::s_adapters;
 %include "util/cardinality.i"
 %include "util/hash.i"
 %include "util/proof.i"
-%include "util/regexp.i"
 %include "util/result.i"
 %include "util/sexpr.i"
 %include "util/statistics.i"
+%include "util/string.i"
 %include "util/tuple.i"
 %include "util/unsafe_interrupt_exception.i"
 
diff --git a/src/theory/evaluator.h b/src/theory/evaluator.h
index 58e179fbe..b9b15c6c6 100644
--- a/src/theory/evaluator.h
+++ b/src/theory/evaluator.h
@@ -27,7 +27,7 @@
 #include "expr/node.h"
 #include "util/bitvector.h"
 #include "util/rational.h"
-#include "util/regexp.h"
+#include "util/string.h"
 
 namespace CVC4 {
 namespace theory {
diff --git a/src/theory/strings/kinds b/src/theory/strings/kinds
index 5b988061b..3f7abdb7c 100644
--- a/src/theory/strings/kinds
+++ b/src/theory/strings/kinds
@@ -37,15 +37,15 @@ operator STRING_REV 1 "string reverse"
 sort STRING_TYPE \
     Cardinality::INTEGERS \
     well-founded \
-        "NodeManager::currentNM()->mkConst(::CVC4::String())" \
-        "util/regexp.h" \
+    "NodeManager::currentNM()->mkConst(::CVC4::String())" \
+    "util/string.h" \
     "String type"
 
 sort REGEXP_TYPE \
     Cardinality::INTEGERS \
     well-founded \
-        "NodeManager::currentNM()->mkNode(REGEXP_EMPTY, std::vector<Node>() )" \
-        "util/regexp.h" \
+    "NodeManager::currentNM()->mkNode(REGEXP_EMPTY, std::vector<Node>() )" \
+    "util/string.h" \
     "RegExp type"
 
 enumerator STRING_TYPE \
@@ -55,7 +55,7 @@ enumerator STRING_TYPE \
 constant CONST_STRING \
     ::CVC4::String \
     ::CVC4::strings::StringHashFunction \
-    "util/regexp.h" \
+    "util/string.h" \
     "a string of characters"
 
 # equal equal / less than / output
diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h
index b9dbedba5..7845b2e00 100644
--- a/src/theory/strings/regexp_operation.h
+++ b/src/theory/strings/regexp_operation.h
@@ -24,10 +24,9 @@
 #include <algorithm>
 #include <climits>
 #include "util/hash.h"
-#include "util/regexp.h"
+#include "util/string.h"
 #include "theory/theory.h"
 #include "theory/rewriter.h"
-//#include "context/cdhashmap.h"
 
 namespace CVC4 {
 namespace theory {
diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h
index 4880af905..d18604752 100644
--- a/src/theory/strings/regexp_solver.h
+++ b/src/theory/strings/regexp_solver.h
@@ -27,7 +27,7 @@
 #include "theory/strings/inference_manager.h"
 #include "theory/strings/regexp_operation.h"
 #include "theory/strings/solver_state.h"
-#include "util/regexp.h"
+#include "util/string.h"
 
 namespace CVC4 {
 namespace theory {
diff --git a/src/theory/strings/type_enumerator.cpp b/src/theory/strings/type_enumerator.cpp
index 7352ae5de..d24206860 100644
--- a/src/theory/strings/type_enumerator.cpp
+++ b/src/theory/strings/type_enumerator.cpp
@@ -15,7 +15,7 @@
 #include "theory/strings/type_enumerator.h"
 
 #include "theory/strings/theory_strings_utils.h"
-#include "util/regexp.h"
+#include "util/string.h"
 
 namespace CVC4 {
 namespace theory {
diff --git a/src/theory/strings/word.cpp b/src/theory/strings/word.cpp
index dd573b68c..0faeffd99 100644
--- a/src/theory/strings/word.cpp
+++ b/src/theory/strings/word.cpp
@@ -14,7 +14,7 @@
 
 #include "theory/strings/word.h"
 
-#include "util/regexp.h"
+#include "util/string.h"
 
 using namespace CVC4::kind;
 
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 75597edac..6895dc01a 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -25,8 +25,6 @@ libcvc4_add_sources(
   proof.h
   random.cpp
   random.h
-  regexp.cpp
-  regexp.h
   resource_manager.cpp
   resource_manager.h
   result.cpp
@@ -43,6 +41,8 @@ libcvc4_add_sources(
   statistics.h
   statistics_registry.cpp
   statistics_registry.h
+  string.cpp
+  string.h
   tuple.h
   unsafe_interrupt_exception.h
   utility.cpp
diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp
deleted file mode 100644
index 36ba7182b..000000000
--- a/src/util/regexp.cpp
+++ /dev/null
@@ -1,488 +0,0 @@
-/*********************                                                        */
-/*! \file regexp.cpp
- ** \verbatim
- ** Top contributors (to current version):
- **   Tim King, Tianyi Liang, Andrew Reynolds
- ** This file is part of the CVC4 project.
- ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
- ** in the top-level source directory) and their institutional affiliations.
- ** All rights reserved.  See the file COPYING in the top-level source
- ** directory for licensing information.\endverbatim
- **
- ** \brief [[ Add one-line brief description here ]]
- **
- ** [[ Add lengthier description here ]]
- ** \todo document this file
- **/
-
-#include "util/regexp.h"
-
-#include <algorithm>
-#include <climits>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-
-#include "base/check.h"
-#include "base/exception.h"
-
-using namespace std;
-
-namespace CVC4 {
-
-static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
-
-String::String(const std::vector<unsigned> &s) : d_str(s)
-{
-#ifdef CVC4_ASSERTIONS
-  for (unsigned u : d_str)
-  {
-    Assert(u < num_codes());
-  }
-#endif
-}
-
-int String::cmp(const String &y) const {
-  if (size() != y.size()) {
-    return size() < y.size() ? -1 : 1;
-  }
-  for (unsigned int i = 0; i < size(); ++i) {
-    if (d_str[i] != y.d_str[i]) {
-      unsigned cp = d_str[i];
-      unsigned cpy = y.d_str[i];
-      return cp < cpy ? -1 : 1;
-    }
-  }
-  return 0;
-}
-
-String String::concat(const String &other) const {
-  std::vector<unsigned int> ret_vec(d_str);
-  ret_vec.insert(ret_vec.end(), other.d_str.begin(), other.d_str.end());
-  return String(ret_vec);
-}
-
-bool String::strncmp(const String& y, std::size_t n) const
-{
-  std::size_t b = (size() >= y.size()) ? size() : y.size();
-  std::size_t s = (size() <= y.size()) ? size() : y.size();
-  if (n > s) {
-    if (b == s) {
-      n = s;
-    } else {
-      return false;
-    }
-  }
-  for (std::size_t i = 0; i < n; ++i) {
-    if (d_str[i] != y.d_str[i]) return false;
-  }
-  return true;
-}
-
-bool String::rstrncmp(const String& y, std::size_t n) const
-{
-  std::size_t b = (size() >= y.size()) ? size() : y.size();
-  std::size_t s = (size() <= y.size()) ? size() : y.size();
-  if (n > s) {
-    if (b == s) {
-      n = s;
-    } else {
-      return false;
-    }
-  }
-  for (std::size_t i = 0; i < n; ++i) {
-    if (d_str[size() - i - 1] != y.d_str[y.size() - i - 1]) return false;
-  }
-  return true;
-}
-
-void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
-{
-  // if not a printable character
-  if (ch > 127 || ch < 32)
-  {
-    std::stringstream serr;
-    serr << "Illegal string character: \"" << ch
-         << "\", must use escape sequence";
-    throw CVC4::Exception(serr.str());
-  }
-  else
-  {
-    str.push_back(static_cast<unsigned>(ch));
-  }
-}
-
-std::vector<unsigned> String::toInternal(const std::string& s,
-                                         bool useEscSequences)
-{
-  std::vector<unsigned> str;
-  unsigned i = 0;
-  while (i < s.size())
-  {
-    // get the current character
-    char si = s[i];
-    if (si != '\\' || !useEscSequences)
-    {
-      addCharToInternal(si, str);
-      ++i;
-      continue;
-    }
-    // the vector of characters, in case we fail to read an escape sequence
-    std::vector<unsigned> nonEscCache;
-    // process the '\'
-    addCharToInternal(si, nonEscCache);
-    ++i;
-    // are we an escape sequence?
-    bool isEscapeSequence = true;
-    // the string corresponding to the hexidecimal code point
-    std::stringstream hexString;
-    // is the slash followed by a 'u'? Could be last character.
-    if (i >= s.size() || s[i] != 'u')
-    {
-      isEscapeSequence = false;
-    }
-    else
-    {
-      // process the 'u'
-      addCharToInternal(s[i], nonEscCache);
-      ++i;
-      bool isStart = true;
-      bool isEnd = false;
-      bool hasBrace = false;
-      while (i < s.size())
-      {
-        // add the next character
-        si = s[i];
-        if (isStart)
-        {
-          isStart = false;
-          // possibly read '{'
-          if (si == '{')
-          {
-            hasBrace = true;
-            addCharToInternal(si, nonEscCache);
-            ++i;
-            continue;
-          }
-        }
-        else if (si == '}')
-        {
-          // can only end if we had an open brace and read at least one digit
-          isEscapeSequence = hasBrace && !hexString.str().empty();
-          isEnd = true;
-          addCharToInternal(si, nonEscCache);
-          ++i;
-          break;
-        }
-        // must be a hex digit at this point
-        if (!isHexDigit(static_cast<unsigned>(si)))
-        {
-          isEscapeSequence = false;
-          break;
-        }
-        hexString << si;
-        addCharToInternal(si, nonEscCache);
-        ++i;
-        if (!hasBrace && hexString.str().size() == 4)
-        {
-          // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens
-          isEnd = true;
-          break;
-        }
-        else if (hasBrace && hexString.str().size() > 5)
-        {
-          // too many digits enclosed in brace, not an escape sequence
-          isEscapeSequence = false;
-          break;
-        }
-      }
-      if (!isEnd)
-      {
-        // if we were interupted before ending, then this is not a valid
-        // escape sequence
-        isEscapeSequence = false;
-      }
-    }
-    if (isEscapeSequence)
-    {
-      Assert(!hexString.str().empty() && hexString.str().size() <= 5);
-      // Otherwise, we add the escaped character.
-      // This is guaranteed not to overflow due to the length of hstr.
-      uint32_t val;
-      hexString >> std::hex >> val;
-      if (val > num_codes())
-      {
-        // Failed due to being out of range. This can happen for strings of
-        // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not
-        // in the range [0-2].
-        isEscapeSequence = false;
-      }
-      else
-      {
-        str.push_back(val);
-      }
-    }
-    // if we did not successfully parse an escape sequence, we add back all
-    // characters that we cached
-    if (!isEscapeSequence)
-    {
-      str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
-    }
-  }
-#ifdef CVC4_ASSERTIONS
-  for (unsigned u : str)
-  {
-    Assert(u < num_codes());
-  }
-#endif
-  return str;
-}
-
-unsigned String::front() const
-{
-  Assert(!d_str.empty());
-  return d_str.front();
-}
-
-unsigned String::back() const
-{
-  Assert(!d_str.empty());
-  return d_str.back();
-}
-
-std::size_t String::overlap(const String &y) const {
-  std::size_t i = size() < y.size() ? size() : y.size();
-  for (; i > 0; i--) {
-    String s = suffix(i);
-    String p = y.prefix(i);
-    if (s == p) {
-      return i;
-    }
-  }
-  return i;
-}
-
-std::size_t String::roverlap(const String &y) const {
-  std::size_t i = size() < y.size() ? size() : y.size();
-  for (; i > 0; i--) {
-    String s = prefix(i);
-    String p = y.suffix(i);
-    if (s == p) {
-      return i;
-    }
-  }
-  return i;
-}
-
-std::string String::toString(bool useEscSequences) const {
-  std::stringstream str;
-  for (unsigned int i = 0; i < size(); ++i) {
-    // we always print forward slash as a code point so that it cannot
-    // be interpreted as specifying part of a code point, e.g. the string
-    // '\' + 'u' + '0' of length three.
-    if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
-    {
-      str << static_cast<char>(d_str[i]);
-    }
-    else
-    {
-      std::stringstream ss;
-      ss << std::hex << d_str[i];
-      str << "\\u{" << ss.str() << "}";
-    }
-  }
-  return str.str();
-}
-
-bool String::isLeq(const String &y) const
-{
-  for (unsigned i = 0; i < size(); ++i)
-  {
-    if (i >= y.size())
-    {
-      return false;
-    }
-    unsigned ci = d_str[i];
-    unsigned cyi = y.d_str[i];
-    if (ci > cyi)
-    {
-      return false;
-    }
-    if (ci < cyi)
-    {
-      return true;
-    }
-  }
-  return true;
-}
-
-bool String::isRepeated() const {
-  if (size() > 1) {
-    unsigned int f = d_str[0];
-    for (unsigned i = 1; i < size(); ++i) {
-      if (f != d_str[i]) return false;
-    }
-  }
-  return true;
-}
-
-bool String::tailcmp(const String &y, int &c) const {
-  int id_x = size() - 1;
-  int id_y = y.size() - 1;
-  while (id_x >= 0 && id_y >= 0) {
-    if (d_str[id_x] != y.d_str[id_y]) {
-      c = id_x;
-      return false;
-    }
-    --id_x;
-    --id_y;
-  }
-  c = id_x == -1 ? (-(id_y + 1)) : (id_x + 1);
-  return true;
-}
-
-std::size_t String::find(const String &y, const std::size_t start) const {
-  if (size() < y.size() + start) return std::string::npos;
-  if (y.empty()) return start;
-  if (empty()) return std::string::npos;
-
-  std::vector<unsigned>::const_iterator itr = std::search(
-      d_str.begin() + start, d_str.end(), y.d_str.begin(), y.d_str.end());
-  if (itr != d_str.end()) {
-    return itr - d_str.begin();
-  }
-  return std::string::npos;
-}
-
-std::size_t String::rfind(const String &y, const std::size_t start) const {
-  if (size() < y.size() + start) return std::string::npos;
-  if (y.empty()) return start;
-  if (empty()) return std::string::npos;
-
-  std::vector<unsigned>::const_reverse_iterator itr = std::search(
-      d_str.rbegin() + start, d_str.rend(), y.d_str.rbegin(), y.d_str.rend());
-  if (itr != d_str.rend()) {
-    return itr - d_str.rbegin();
-  }
-  return std::string::npos;
-}
-
-bool String::hasPrefix(const String& y) const
-{
-  size_t s = size();
-  size_t ys = y.size();
-  if (ys > s)
-  {
-    return false;
-  }
-  for (size_t i = 0; i < ys; i++)
-  {
-    if (d_str[i] != y.d_str[i])
-    {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool String::hasSuffix(const String& y) const
-{
-  size_t s = size();
-  size_t ys = y.size();
-  if (ys > s)
-  {
-    return false;
-  }
-  size_t idiff = s - ys;
-  for (size_t i = 0; i < ys; i++)
-  {
-    if (d_str[i + idiff] != y.d_str[i])
-    {
-      return false;
-    }
-  }
-  return true;
-}
-
-String String::replace(const String &s, const String &t) const {
-  std::size_t ret = find(s);
-  if (ret != std::string::npos) {
-    std::vector<unsigned int> vec;
-    vec.insert(vec.begin(), d_str.begin(), d_str.begin() + ret);
-    vec.insert(vec.end(), t.d_str.begin(), t.d_str.end());
-    vec.insert(vec.end(), d_str.begin() + ret + s.size(), d_str.end());
-    return String(vec);
-  } else {
-    return *this;
-  }
-}
-
-String String::substr(std::size_t i) const {
-  Assert(i <= size());
-  std::vector<unsigned int> ret_vec;
-  std::vector<unsigned int>::const_iterator itr = d_str.begin() + i;
-  ret_vec.insert(ret_vec.end(), itr, d_str.end());
-  return String(ret_vec);
-}
-
-String String::substr(std::size_t i, std::size_t j) const {
-  Assert(i + j <= size());
-  std::vector<unsigned int> ret_vec;
-  std::vector<unsigned int>::const_iterator itr = d_str.begin() + i;
-  ret_vec.insert(ret_vec.end(), itr, itr + j);
-  return String(ret_vec);
-}
-
-bool String::noOverlapWith(const String& y) const
-{
-  return y.find(*this) == std::string::npos
-         && this->find(y) == std::string::npos && this->overlap(y) == 0
-         && y.overlap(*this) == 0;
-}
-
-bool String::isNumber() const {
-  if (d_str.empty()) {
-    return false;
-  }
-  for (unsigned character : d_str) {
-    if (!isDigit(character))
-    {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool String::isDigit(unsigned character)
-{
-  // '0' to '9'
-  return 48 <= character && character <= 57;
-}
-
-bool String::isHexDigit(unsigned character)
-{
-  // '0' to '9' or 'A' to 'F' or 'a' to 'f'
-  return isDigit(character) || (65 <= character && character <= 70)
-         || (97 <= character && character <= 102);
-}
-
-bool String::isPrintable(unsigned character)
-{
-  // Unicode 0x00020 (' ') to 0x0007E ('~')
-  return 32 <= character && character <= 126;
-}
-
-size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
-
-Rational String::toNumber() const
-{
-  // when smt2 standard for strings is set, this may change, based on the
-  // semantics of str.from.int for leading zeros
-  return Rational(toString());
-}
-
-std::ostream &operator<<(std::ostream &os, const String &s) {
-  return os << "\"" << s.toString(true) << "\"";
-}
-
-}  // namespace CVC4
diff --git a/src/util/regexp.h b/src/util/regexp.h
deleted file mode 100644
index 56fb969a3..000000000
--- a/src/util/regexp.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/*********************                                                        */
-/*! \file regexp.h
- ** \verbatim
- ** Top contributors (to current version):
- **   Andrew Reynolds, Tim King, Tianyi Liang
- ** This file is part of the CVC4 project.
- ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
- ** in the top-level source directory) and their institutional affiliations.
- ** All rights reserved.  See the file COPYING in the top-level source
- ** directory for licensing information.\endverbatim
- **
- ** \brief [[ Add one-line brief description here ]]
- **
- ** [[ Add lengthier description here ]]
- ** \todo document this file
- **/
-
-#include "cvc4_public.h"
-
-#ifndef CVC4__REGEXP_H
-#define CVC4__REGEXP_H
-
-#include <cstddef>
-#include <functional>
-#include <ostream>
-#include <string>
-#include <vector>
-#include "util/rational.h"
-
-namespace CVC4 {
-
-/** The CVC4 string class
- *
- * This data structure is the domain of values for the string type. It can also
- * be used as a generic utility for representing strings.
- */
-class CVC4_PUBLIC String {
- public:
-  /**
-   * This is the cardinality of the alphabet that is representable by this
-   * class. Notice that this must be greater than or equal to the cardinality
-   * of the alphabet that the string theory reasons about.
-   *
-   * This must be strictly less than std::numeric_limits<unsigned>::max().
-   *
-   * As per the SMT-LIB standard for strings, we support the first 3 planes of
-   * Unicode characters, where 196608 = 3*16^4.
-   */
-  static inline unsigned num_codes() { return 196608; }
-  /** constructors for String
-   *
-   * Internally, a CVC4::String is represented by a vector of unsigned
-   * integers (d_str) representing the code points of the characters.
-   *
-   * To build a string from a C++ string, we may process escape sequences
-   * according to the SMT-LIB standard. In particular, if useEscSequences is
-   * true, we convert unicode escape sequences:
-   *  \u d_3 d_2 d_1 d_0
-   *  \u{d_0}
-   *  \u{d_1 d_0}
-   *  \u{d_2 d_1 d_0}
-   *  \u{d_3 d_2 d_1 d_0}
-   *  \u{d_4 d_3 d_2 d_1 d_0}
-   * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
-   *
-   * If useEscSequences is false, then the characters of the constructed
-   * CVC4::String correspond one-to-one with the input string.
-   */
-  String() = default;
-  explicit String(const std::string& s, bool useEscSequences = false)
-      : d_str(toInternal(s, useEscSequences))
-  {
-  }
-  explicit String(const char* s, bool useEscSequences = false)
-      : d_str(toInternal(std::string(s), useEscSequences))
-  {
-  }
-  explicit String(const std::vector<unsigned>& s);
-
-  String& operator=(const String& y) {
-    if (this != &y) {
-      d_str = y.d_str;
-    }
-    return *this;
-  }
-
-  String concat(const String& other) const;
-
-  bool operator==(const String& y) const { return cmp(y) == 0; }
-  bool operator!=(const String& y) const { return cmp(y) != 0; }
-  bool operator<(const String& y) const { return cmp(y) < 0; }
-  bool operator>(const String& y) const { return cmp(y) > 0; }
-  bool operator<=(const String& y) const { return cmp(y) <= 0; }
-  bool operator>=(const String& y) const { return cmp(y) >= 0; }
-
-  /**
-   * Returns true if this string is equal to y for their first n characters.
-   * If n is larger than the length of this string or y, this method returns
-   * true if and only if this string is equal to y.
-   */
-  bool strncmp(const String& y, std::size_t n) const;
-  /**
-   * Returns true if this string is equal to y for their last n characters.
-   * Similar to strncmp, if n is larger than the length of this string or y,
-   * this method returns true if and only if this string is equal to y.
-   */
-  bool rstrncmp(const String& y, std::size_t n) const;
-
-  /* toString
-   * Converts this string to a std::string.
-   *
-   * The unprintable characters are converted to unicode escape sequences as
-   * described above.
-   *
-   * If useEscSequences is false, the string's printable characters are
-   * printed as characters. Notice that for all std::string s having only
-   * printable characters, we have that
-   *    CVC4::String( s ).toString() = s.
-   */
-  std::string toString(bool useEscSequences = false) const;
-  /** is this the empty string? */
-  bool empty() const { return d_str.empty(); }
-  /** is this the empty string? */
-  bool isEmptyString() const { return empty(); }
-  /** is less than or equal to string y */
-  bool isLeq(const String& y) const;
-  /** Return the length of the string */
-  std::size_t size() const { return d_str.size(); }
-
-  bool isRepeated() const;
-  bool tailcmp(const String& y, int& c) const;
-
-  /**
-   * Return the first position y occurs in this string, or std::string::npos
-   * otherwise.
-   */
-  std::size_t find(const String& y, const std::size_t start = 0) const;
-  /**
-   * Return the first position y occurs in this string searching from the end,
-   * or std::string::npos otherwise.
-   */
-  std::size_t rfind(const String& y, const std::size_t start = 0) const;
-  /** Returns true if y is a prefix of this */
-  bool hasPrefix(const String& y) const;
-  /** Returns true if y is a suffix of this */
-  bool hasSuffix(const String& y) const;
-  /** Replace the first occurrence of s in this string with t */
-  String replace(const String& s, const String& t) const;
-  /** Return the substring of this string starting at index i */
-  String substr(std::size_t i) const;
-  /** Return the substring of this string starting at index i with size at most
-   * j */
-  String substr(std::size_t i, std::size_t j) const;
-  /** Return the prefix of this string of size at most i */
-  String prefix(std::size_t i) const { return substr(0, i); }
-  /** Return the suffix of this string of size at most i */
-  String suffix(std::size_t i) const { return substr(size() - i, i); }
-
-  /**
-   * Checks if there is any overlap between this string and another string. This
-   * corresponds to checking whether one string contains the other and whether a
-   * substring of one is a prefix of the other and vice-versa.
-   *
-   * @param y The other string
-   * @return True if there is an overlap, false otherwise
-   */
-  bool noOverlapWith(const String& y) const;
-
-  /** string overlap
-  *
-  * if overlap returns m>0,
-  * then the maximal suffix of this string that is a prefix of y is of length m.
-  *
-  * For example, if x is "abcdef", then:
-  * x.overlap("defg") = 3
-  * x.overlap("ab") = 0
-  * x.overlap("d") = 0
-  * x.overlap("bcdefdef") = 5
-  */
-  std::size_t overlap(const String& y) const;
-  /** string reverse overlap
-  *
-  * if roverlap returns m>0,
-  * then the maximal prefix of this string that is a suffix of y is of length m.
-  *
-  * For example, if x is "abcdef", then:
-  * x.roverlap("aaabc") = 3
-  * x.roverlap("def") = 0
-  * x.roverlap("d") = 0
-  * x.roverlap("defabcde") = 5
-  *
-  * Notice that x.overlap(y) = y.roverlap(x)
-  */
-  std::size_t roverlap(const String& y) const;
-
-  /**
-   * Returns true if this string corresponds in text to a number, for example
-   * this returns true for strings "7", "12", "004", "0" and false for strings
-   * "abc", "4a", "-4", "".
-   */
-  bool isNumber() const;
-  /** Returns the corresponding rational for the text of this string. */
-  Rational toNumber() const;
-  /** Get the unsigned representation (code points) of this string */
-  const std::vector<unsigned>& getVec() const { return d_str; }
-  /**
-   * Get the unsigned (code point) value of the first character in this string
-   */
-  unsigned front() const;
-  /**
-   * Get the unsigned (code point) value of the last character in this string
-   */
-  unsigned back() const;
-  /** is the unsigned a digit?
-   *
-   * This is true for code points between 48 ('0') and 57 ('9').
-   */
-  static bool isDigit(unsigned character);
-  /** is the unsigned a hexidecimal digit?
-   *
-   * This is true for code points between 48 ('0') and 57 ('9'), code points
-   * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
-   */
-  static bool isHexDigit(unsigned character);
-  /** is the unsigned a printable code point?
-   *
-   * This is true for Unicode 32 (' ') to 126 ('~').
-   */
-  static bool isPrintable(unsigned character);
-
-  /**
-   * Returns the maximum length of string representable by this class.
-   * Corresponds to the maximum size of d_str.
-   */
-  static size_t maxSize();
- private:
-  /**
-   * Helper for toInternal: add character ch to vector vec, storing a string in
-   * internal format. This throws an error if ch is not a printable character,
-   * since non-printable characters must be escaped in SMT-LIB.
-   */
-  static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
-  /**
-   * Convert the string s to the internal format (vector of code points).
-   * The argument useEscSequences is whether to process unicode escape
-   * sequences.
-   */
-  static std::vector<unsigned> toInternal(const std::string& s,
-                                          bool useEscSequences);
-
-  /**
-   * Returns a negative number if *this < y, 0 if *this and y are equal and a
-   * positive number if *this > y.
-   */
-  int cmp(const String& y) const;
-
-  std::vector<unsigned> d_str;
-}; /* class String */
-
-namespace strings {
-
-struct CVC4_PUBLIC StringHashFunction {
-  size_t operator()(const ::CVC4::String& s) const {
-    return std::hash<std::string>()(s.toString());
-  }
-}; /* struct StringHashFunction */
-
-}  // namespace strings
-
-std::ostream& operator<<(std::ostream& os, const String& s) CVC4_PUBLIC;
-
-}  // namespace CVC4
-
-#endif /* CVC4__REGEXP_H */
diff --git a/src/util/regexp.i b/src/util/regexp.i
deleted file mode 100644
index afc51abd7..000000000
--- a/src/util/regexp.i
+++ /dev/null
@@ -1,25 +0,0 @@
-%{
-#include "util/regexp.h"
-%}
-
-%rename(CVC4String) String;
-%rename(CVC4StringHashFunction) CVC4::strings::StringHashFunction;
-
-%ignore CVC4::String::String(const std::string&);
-
-%rename(assign) CVC4::String::operator=(const String&);
-%rename(getChar) CVC4::String::operator[](const unsigned int) const;
-%rename(equals) CVC4::String::operator==(const String&) const;
-%ignore CVC4::String::operator!=(const String&) const;
-%rename(less) CVC4::String::operator<(const String&) const;
-%rename(lessEqual) CVC4::String::operator<=(const String&) const;
-%rename(greater) CVC4::String::operator>(const String&) const;
-%rename(greaterEqual) CVC4::String::operator>=(const String&) const;
-
-%rename(apply) CVC4::strings::StringHashFunction::operator()(const ::CVC4::String&) const;
-
-%ignore CVC4::operator<<(std::ostream&, const String&);
-
-%apply int &OUTPUT { int &c };
-%include "util/regexp.h"
-%clear int &c;
diff --git a/src/util/string.cpp b/src/util/string.cpp
new file mode 100644
index 000000000..ff522ba7b
--- /dev/null
+++ b/src/util/string.cpp
@@ -0,0 +1,485 @@
+/*********************                                                        */
+/*! \file string.cpp
+ ** \verbatim
+ ** Top contributors (to current version):
+ **   Tim King, Tianyi Liang, Andrew Reynolds
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
+ ** in the top-level source directory) and their institutional affiliations.
+ ** All rights reserved.  See the file COPYING in the top-level source
+ ** directory for licensing information.\endverbatim
+ **
+ ** \brief Implementation of the string data type.
+ **/
+
+#include "util/string.h"
+
+#include <algorithm>
+#include <climits>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "base/check.h"
+#include "base/exception.h"
+
+using namespace std;
+
+namespace CVC4 {
+
+static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
+
+String::String(const std::vector<unsigned> &s) : d_str(s)
+{
+#ifdef CVC4_ASSERTIONS
+  for (unsigned u : d_str)
+  {
+    Assert(u < num_codes());
+  }
+#endif
+}
+
+int String::cmp(const String &y) const {
+  if (size() != y.size()) {
+    return size() < y.size() ? -1 : 1;
+  }
+  for (unsigned int i = 0; i < size(); ++i) {
+    if (d_str[i] != y.d_str[i]) {
+      unsigned cp = d_str[i];
+      unsigned cpy = y.d_str[i];
+      return cp < cpy ? -1 : 1;
+    }
+  }
+  return 0;
+}
+
+String String::concat(const String &other) const {
+  std::vector<unsigned int> ret_vec(d_str);
+  ret_vec.insert(ret_vec.end(), other.d_str.begin(), other.d_str.end());
+  return String(ret_vec);
+}
+
+bool String::strncmp(const String& y, std::size_t n) const
+{
+  std::size_t b = (size() >= y.size()) ? size() : y.size();
+  std::size_t s = (size() <= y.size()) ? size() : y.size();
+  if (n > s) {
+    if (b == s) {
+      n = s;
+    } else {
+      return false;
+    }
+  }
+  for (std::size_t i = 0; i < n; ++i) {
+    if (d_str[i] != y.d_str[i]) return false;
+  }
+  return true;
+}
+
+bool String::rstrncmp(const String& y, std::size_t n) const
+{
+  std::size_t b = (size() >= y.size()) ? size() : y.size();
+  std::size_t s = (size() <= y.size()) ? size() : y.size();
+  if (n > s) {
+    if (b == s) {
+      n = s;
+    } else {
+      return false;
+    }
+  }
+  for (std::size_t i = 0; i < n; ++i) {
+    if (d_str[size() - i - 1] != y.d_str[y.size() - i - 1]) return false;
+  }
+  return true;
+}
+
+void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
+{
+  // if not a printable character
+  if (ch > 127 || ch < 32)
+  {
+    std::stringstream serr;
+    serr << "Illegal string character: \"" << ch
+         << "\", must use escape sequence";
+    throw CVC4::Exception(serr.str());
+  }
+  else
+  {
+    str.push_back(static_cast<unsigned>(ch));
+  }
+}
+
+std::vector<unsigned> String::toInternal(const std::string& s,
+                                         bool useEscSequences)
+{
+  std::vector<unsigned> str;
+  unsigned i = 0;
+  while (i < s.size())
+  {
+    // get the current character
+    char si = s[i];
+    if (si != '\\' || !useEscSequences)
+    {
+      addCharToInternal(si, str);
+      ++i;
+      continue;
+    }
+    // the vector of characters, in case we fail to read an escape sequence
+    std::vector<unsigned> nonEscCache;
+    // process the '\'
+    addCharToInternal(si, nonEscCache);
+    ++i;
+    // are we an escape sequence?
+    bool isEscapeSequence = true;
+    // the string corresponding to the hexidecimal code point
+    std::stringstream hexString;
+    // is the slash followed by a 'u'? Could be last character.
+    if (i >= s.size() || s[i] != 'u')
+    {
+      isEscapeSequence = false;
+    }
+    else
+    {
+      // process the 'u'
+      addCharToInternal(s[i], nonEscCache);
+      ++i;
+      bool isStart = true;
+      bool isEnd = false;
+      bool hasBrace = false;
+      while (i < s.size())
+      {
+        // add the next character
+        si = s[i];
+        if (isStart)
+        {
+          isStart = false;
+          // possibly read '{'
+          if (si == '{')
+          {
+            hasBrace = true;
+            addCharToInternal(si, nonEscCache);
+            ++i;
+            continue;
+          }
+        }
+        else if (si == '}')
+        {
+          // can only end if we had an open brace and read at least one digit
+          isEscapeSequence = hasBrace && !hexString.str().empty();
+          isEnd = true;
+          addCharToInternal(si, nonEscCache);
+          ++i;
+          break;
+        }
+        // must be a hex digit at this point
+        if (!isHexDigit(static_cast<unsigned>(si)))
+        {
+          isEscapeSequence = false;
+          break;
+        }
+        hexString << si;
+        addCharToInternal(si, nonEscCache);
+        ++i;
+        if (!hasBrace && hexString.str().size() == 4)
+        {
+          // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens
+          isEnd = true;
+          break;
+        }
+        else if (hasBrace && hexString.str().size() > 5)
+        {
+          // too many digits enclosed in brace, not an escape sequence
+          isEscapeSequence = false;
+          break;
+        }
+      }
+      if (!isEnd)
+      {
+        // if we were interupted before ending, then this is not a valid
+        // escape sequence
+        isEscapeSequence = false;
+      }
+    }
+    if (isEscapeSequence)
+    {
+      Assert(!hexString.str().empty() && hexString.str().size() <= 5);
+      // Otherwise, we add the escaped character.
+      // This is guaranteed not to overflow due to the length of hstr.
+      uint32_t val;
+      hexString >> std::hex >> val;
+      if (val > num_codes())
+      {
+        // Failed due to being out of range. This can happen for strings of
+        // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not
+        // in the range [0-2].
+        isEscapeSequence = false;
+      }
+      else
+      {
+        str.push_back(val);
+      }
+    }
+    // if we did not successfully parse an escape sequence, we add back all
+    // characters that we cached
+    if (!isEscapeSequence)
+    {
+      str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
+    }
+  }
+#ifdef CVC4_ASSERTIONS
+  for (unsigned u : str)
+  {
+    Assert(u < num_codes());
+  }
+#endif
+  return str;
+}
+
+unsigned String::front() const
+{
+  Assert(!d_str.empty());
+  return d_str.front();
+}
+
+unsigned String::back() const
+{
+  Assert(!d_str.empty());
+  return d_str.back();
+}
+
+std::size_t String::overlap(const String &y) const {
+  std::size_t i = size() < y.size() ? size() : y.size();
+  for (; i > 0; i--) {
+    String s = suffix(i);
+    String p = y.prefix(i);
+    if (s == p) {
+      return i;
+    }
+  }
+  return i;
+}
+
+std::size_t String::roverlap(const String &y) const {
+  std::size_t i = size() < y.size() ? size() : y.size();
+  for (; i > 0; i--) {
+    String s = prefix(i);
+    String p = y.suffix(i);
+    if (s == p) {
+      return i;
+    }
+  }
+  return i;
+}
+
+std::string String::toString(bool useEscSequences) const {
+  std::stringstream str;
+  for (unsigned int i = 0; i < size(); ++i) {
+    // we always print forward slash as a code point so that it cannot
+    // be interpreted as specifying part of a code point, e.g. the string
+    // '\' + 'u' + '0' of length three.
+    if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
+    {
+      str << static_cast<char>(d_str[i]);
+    }
+    else
+    {
+      std::stringstream ss;
+      ss << std::hex << d_str[i];
+      str << "\\u{" << ss.str() << "}";
+    }
+  }
+  return str.str();
+}
+
+bool String::isLeq(const String &y) const
+{
+  for (unsigned i = 0; i < size(); ++i)
+  {
+    if (i >= y.size())
+    {
+      return false;
+    }
+    unsigned ci = d_str[i];
+    unsigned cyi = y.d_str[i];
+    if (ci > cyi)
+    {
+      return false;
+    }
+    if (ci < cyi)
+    {
+      return true;
+    }
+  }
+  return true;
+}
+
+bool String::isRepeated() const {
+  if (size() > 1) {
+    unsigned int f = d_str[0];
+    for (unsigned i = 1; i < size(); ++i) {
+      if (f != d_str[i]) return false;
+    }
+  }
+  return true;
+}
+
+bool String::tailcmp(const String &y, int &c) const {
+  int id_x = size() - 1;
+  int id_y = y.size() - 1;
+  while (id_x >= 0 && id_y >= 0) {
+    if (d_str[id_x] != y.d_str[id_y]) {
+      c = id_x;
+      return false;
+    }
+    --id_x;
+    --id_y;
+  }
+  c = id_x == -1 ? (-(id_y + 1)) : (id_x + 1);
+  return true;
+}
+
+std::size_t String::find(const String &y, const std::size_t start) const {
+  if (size() < y.size() + start) return std::string::npos;
+  if (y.empty()) return start;
+  if (empty()) return std::string::npos;
+
+  std::vector<unsigned>::const_iterator itr = std::search(
+      d_str.begin() + start, d_str.end(), y.d_str.begin(), y.d_str.end());
+  if (itr != d_str.end()) {
+    return itr - d_str.begin();
+  }
+  return std::string::npos;
+}
+
+std::size_t String::rfind(const String &y, const std::size_t start) const {
+  if (size() < y.size() + start) return std::string::npos;
+  if (y.empty()) return start;
+  if (empty()) return std::string::npos;
+
+  std::vector<unsigned>::const_reverse_iterator itr = std::search(
+      d_str.rbegin() + start, d_str.rend(), y.d_str.rbegin(), y.d_str.rend());
+  if (itr != d_str.rend()) {
+    return itr - d_str.rbegin();
+  }
+  return std::string::npos;
+}
+
+bool String::hasPrefix(const String& y) const
+{
+  size_t s = size();
+  size_t ys = y.size();
+  if (ys > s)
+  {
+    return false;
+  }
+  for (size_t i = 0; i < ys; i++)
+  {
+    if (d_str[i] != y.d_str[i])
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool String::hasSuffix(const String& y) const
+{
+  size_t s = size();
+  size_t ys = y.size();
+  if (ys > s)
+  {
+    return false;
+  }
+  size_t idiff = s - ys;
+  for (size_t i = 0; i < ys; i++)
+  {
+    if (d_str[i + idiff] != y.d_str[i])
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+String String::replace(const String &s, const String &t) const {
+  std::size_t ret = find(s);
+  if (ret != std::string::npos) {
+    std::vector<unsigned int> vec;
+    vec.insert(vec.begin(), d_str.begin(), d_str.begin() + ret);
+    vec.insert(vec.end(), t.d_str.begin(), t.d_str.end());
+    vec.insert(vec.end(), d_str.begin() + ret + s.size(), d_str.end());
+    return String(vec);
+  } else {
+    return *this;
+  }
+}
+
+String String::substr(std::size_t i) const {
+  Assert(i <= size());
+  std::vector<unsigned int> ret_vec;
+  std::vector<unsigned int>::const_iterator itr = d_str.begin() + i;
+  ret_vec.insert(ret_vec.end(), itr, d_str.end());
+  return String(ret_vec);
+}
+
+String String::substr(std::size_t i, std::size_t j) const {
+  Assert(i + j <= size());
+  std::vector<unsigned int> ret_vec;
+  std::vector<unsigned int>::const_iterator itr = d_str.begin() + i;
+  ret_vec.insert(ret_vec.end(), itr, itr + j);
+  return String(ret_vec);
+}
+
+bool String::noOverlapWith(const String& y) const
+{
+  return y.find(*this) == std::string::npos
+         && this->find(y) == std::string::npos && this->overlap(y) == 0
+         && y.overlap(*this) == 0;
+}
+
+bool String::isNumber() const {
+  if (d_str.empty()) {
+    return false;
+  }
+  for (unsigned character : d_str) {
+    if (!isDigit(character))
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool String::isDigit(unsigned character)
+{
+  // '0' to '9'
+  return 48 <= character && character <= 57;
+}
+
+bool String::isHexDigit(unsigned character)
+{
+  // '0' to '9' or 'A' to 'F' or 'a' to 'f'
+  return isDigit(character) || (65 <= character && character <= 70)
+         || (97 <= character && character <= 102);
+}
+
+bool String::isPrintable(unsigned character)
+{
+  // Unicode 0x00020 (' ') to 0x0007E ('~')
+  return 32 <= character && character <= 126;
+}
+
+size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
+
+Rational String::toNumber() const
+{
+  // when smt2 standard for strings is set, this may change, based on the
+  // semantics of str.from.int for leading zeros
+  return Rational(toString());
+}
+
+std::ostream &operator<<(std::ostream &os, const String &s) {
+  return os << "\"" << s.toString(true) << "\"";
+}
+
+}  // namespace CVC4
diff --git a/src/util/string.h b/src/util/string.h
new file mode 100644
index 000000000..032105812
--- /dev/null
+++ b/src/util/string.h
@@ -0,0 +1,271 @@
+/*********************                                                        */
+/*! \file string.h
+ ** \verbatim
+ ** Top contributors (to current version):
+ **   Andrew Reynolds, Tim King, Tianyi Liang
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
+ ** in the top-level source directory) and their institutional affiliations.
+ ** All rights reserved.  See the file COPYING in the top-level source
+ ** directory for licensing information.\endverbatim
+ **
+ ** \brief The string data type.
+ **/
+
+#include "cvc4_public.h"
+
+#ifndef CVC4__UTIL__STRING_H
+#define CVC4__UTIL__STRING_H
+
+#include <cstddef>
+#include <functional>
+#include <ostream>
+#include <string>
+#include <vector>
+#include "util/rational.h"
+
+namespace CVC4 {
+
+/** The CVC4 string class
+ *
+ * This data structure is the domain of values for the string type. It can also
+ * be used as a generic utility for representing strings.
+ */
+class CVC4_PUBLIC String {
+ public:
+  /**
+   * This is the cardinality of the alphabet that is representable by this
+   * class. Notice that this must be greater than or equal to the cardinality
+   * of the alphabet that the string theory reasons about.
+   *
+   * This must be strictly less than std::numeric_limits<unsigned>::max().
+   *
+   * As per the SMT-LIB standard for strings, we support the first 3 planes of
+   * Unicode characters, where 196608 = 3*16^4.
+   */
+  static inline unsigned num_codes() { return 196608; }
+  /** constructors for String
+   *
+   * Internally, a CVC4::String is represented by a vector of unsigned
+   * integers (d_str) representing the code points of the characters.
+   *
+   * To build a string from a C++ string, we may process escape sequences
+   * according to the SMT-LIB standard. In particular, if useEscSequences is
+   * true, we convert unicode escape sequences:
+   *  \u d_3 d_2 d_1 d_0
+   *  \u{d_0}
+   *  \u{d_1 d_0}
+   *  \u{d_2 d_1 d_0}
+   *  \u{d_3 d_2 d_1 d_0}
+   *  \u{d_4 d_3 d_2 d_1 d_0}
+   * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
+   *
+   * If useEscSequences is false, then the characters of the constructed
+   * CVC4::String correspond one-to-one with the input string.
+   */
+  String() = default;
+  explicit String(const std::string& s, bool useEscSequences = false)
+      : d_str(toInternal(s, useEscSequences))
+  {
+  }
+  explicit String(const char* s, bool useEscSequences = false)
+      : d_str(toInternal(std::string(s), useEscSequences))
+  {
+  }
+  explicit String(const std::vector<unsigned>& s);
+
+  String& operator=(const String& y) {
+    if (this != &y) {
+      d_str = y.d_str;
+    }
+    return *this;
+  }
+
+  String concat(const String& other) const;
+
+  bool operator==(const String& y) const { return cmp(y) == 0; }
+  bool operator!=(const String& y) const { return cmp(y) != 0; }
+  bool operator<(const String& y) const { return cmp(y) < 0; }
+  bool operator>(const String& y) const { return cmp(y) > 0; }
+  bool operator<=(const String& y) const { return cmp(y) <= 0; }
+  bool operator>=(const String& y) const { return cmp(y) >= 0; }
+
+  /**
+   * Returns true if this string is equal to y for their first n characters.
+   * If n is larger than the length of this string or y, this method returns
+   * true if and only if this string is equal to y.
+   */
+  bool strncmp(const String& y, std::size_t n) const;
+  /**
+   * Returns true if this string is equal to y for their last n characters.
+   * Similar to strncmp, if n is larger than the length of this string or y,
+   * this method returns true if and only if this string is equal to y.
+   */
+  bool rstrncmp(const String& y, std::size_t n) const;
+
+  /* toString
+   * Converts this string to a std::string.
+   *
+   * The unprintable characters are converted to unicode escape sequences as
+   * described above.
+   *
+   * If useEscSequences is false, the string's printable characters are
+   * printed as characters. Notice that for all std::string s having only
+   * printable characters, we have that
+   *    CVC4::String( s ).toString() = s.
+   */
+  std::string toString(bool useEscSequences = false) const;
+  /** is this the empty string? */
+  bool empty() const { return d_str.empty(); }
+  /** is this the empty string? */
+  bool isEmptyString() const { return empty(); }
+  /** is less than or equal to string y */
+  bool isLeq(const String& y) const;
+  /** Return the length of the string */
+  std::size_t size() const { return d_str.size(); }
+
+  bool isRepeated() const;
+  bool tailcmp(const String& y, int& c) const;
+
+  /**
+   * Return the first position y occurs in this string, or std::string::npos
+   * otherwise.
+   */
+  std::size_t find(const String& y, const std::size_t start = 0) const;
+  /**
+   * Return the first position y occurs in this string searching from the end,
+   * or std::string::npos otherwise.
+   */
+  std::size_t rfind(const String& y, const std::size_t start = 0) const;
+  /** Returns true if y is a prefix of this */
+  bool hasPrefix(const String& y) const;
+  /** Returns true if y is a suffix of this */
+  bool hasSuffix(const String& y) const;
+  /** Replace the first occurrence of s in this string with t */
+  String replace(const String& s, const String& t) const;
+  /** Return the substring of this string starting at index i */
+  String substr(std::size_t i) const;
+  /** Return the substring of this string starting at index i with size at most
+   * j */
+  String substr(std::size_t i, std::size_t j) const;
+  /** Return the prefix of this string of size at most i */
+  String prefix(std::size_t i) const { return substr(0, i); }
+  /** Return the suffix of this string of size at most i */
+  String suffix(std::size_t i) const { return substr(size() - i, i); }
+
+  /**
+   * Checks if there is any overlap between this string and another string. This
+   * corresponds to checking whether one string contains the other and whether a
+   * substring of one is a prefix of the other and vice-versa.
+   *
+   * @param y The other string
+   * @return True if there is an overlap, false otherwise
+   */
+  bool noOverlapWith(const String& y) const;
+
+  /** string overlap
+  *
+  * if overlap returns m>0,
+  * then the maximal suffix of this string that is a prefix of y is of length m.
+  *
+  * For example, if x is "abcdef", then:
+  * x.overlap("defg") = 3
+  * x.overlap("ab") = 0
+  * x.overlap("d") = 0
+  * x.overlap("bcdefdef") = 5
+  */
+  std::size_t overlap(const String& y) const;
+  /** string reverse overlap
+  *
+  * if roverlap returns m>0,
+  * then the maximal prefix of this string that is a suffix of y is of length m.
+  *
+  * For example, if x is "abcdef", then:
+  * x.roverlap("aaabc") = 3
+  * x.roverlap("def") = 0
+  * x.roverlap("d") = 0
+  * x.roverlap("defabcde") = 5
+  *
+  * Notice that x.overlap(y) = y.roverlap(x)
+  */
+  std::size_t roverlap(const String& y) const;
+
+  /**
+   * Returns true if this string corresponds in text to a number, for example
+   * this returns true for strings "7", "12", "004", "0" and false for strings
+   * "abc", "4a", "-4", "".
+   */
+  bool isNumber() const;
+  /** Returns the corresponding rational for the text of this string. */
+  Rational toNumber() const;
+  /** Get the unsigned representation (code points) of this string */
+  const std::vector<unsigned>& getVec() const { return d_str; }
+  /**
+   * Get the unsigned (code point) value of the first character in this string
+   */
+  unsigned front() const;
+  /**
+   * Get the unsigned (code point) value of the last character in this string
+   */
+  unsigned back() const;
+  /** is the unsigned a digit?
+   *
+   * This is true for code points between 48 ('0') and 57 ('9').
+   */
+  static bool isDigit(unsigned character);
+  /** is the unsigned a hexidecimal digit?
+   *
+   * This is true for code points between 48 ('0') and 57 ('9'), code points
+   * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
+   */
+  static bool isHexDigit(unsigned character);
+  /** is the unsigned a printable code point?
+   *
+   * This is true for Unicode 32 (' ') to 126 ('~').
+   */
+  static bool isPrintable(unsigned character);
+
+  /**
+   * Returns the maximum length of string representable by this class.
+   * Corresponds to the maximum size of d_str.
+   */
+  static size_t maxSize();
+ private:
+  /**
+   * Helper for toInternal: add character ch to vector vec, storing a string in
+   * internal format. This throws an error if ch is not a printable character,
+   * since non-printable characters must be escaped in SMT-LIB.
+   */
+  static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
+  /**
+   * Convert the string s to the internal format (vector of code points).
+   * The argument useEscSequences is whether to process unicode escape
+   * sequences.
+   */
+  static std::vector<unsigned> toInternal(const std::string& s,
+                                          bool useEscSequences);
+
+  /**
+   * Returns a negative number if *this < y, 0 if *this and y are equal and a
+   * positive number if *this > y.
+   */
+  int cmp(const String& y) const;
+
+  std::vector<unsigned> d_str;
+}; /* class String */
+
+namespace strings {
+
+struct CVC4_PUBLIC StringHashFunction {
+  size_t operator()(const ::CVC4::String& s) const {
+    return std::hash<std::string>()(s.toString());
+  }
+}; /* struct StringHashFunction */
+
+}  // namespace strings
+
+std::ostream& operator<<(std::ostream& os, const String& s) CVC4_PUBLIC;
+
+}  // namespace CVC4
+
+#endif /* CVC4__UTIL__STRING_H */
diff --git a/src/util/string.i b/src/util/string.i
new file mode 100644
index 000000000..1ded901aa
--- /dev/null
+++ b/src/util/string.i
@@ -0,0 +1,25 @@
+%{
+#include "util/string.h"
+%}
+
+%rename(CVC4String) String;
+%rename(CVC4StringHashFunction) CVC4::strings::StringHashFunction;
+
+%ignore CVC4::String::String(const std::string&);
+
+%rename(assign) CVC4::String::operator=(const String&);
+%rename(getChar) CVC4::String::operator[](const unsigned int) const;
+%rename(equals) CVC4::String::operator==(const String&) const;
+%ignore CVC4::String::operator!=(const String&) const;
+%rename(less) CVC4::String::operator<(const String&) const;
+%rename(lessEqual) CVC4::String::operator<=(const String&) const;
+%rename(greater) CVC4::String::operator>(const String&) const;
+%rename(greaterEqual) CVC4::String::operator>=(const String&) const;
+
+%rename(apply) CVC4::strings::StringHashFunction::operator()(const ::CVC4::String&) const;
+
+%ignore CVC4::operator<<(std::ostream&, const String&);
+
+%apply int &OUTPUT { int &c };
+%include "util/string.h"
+%clear int &c;
diff --git a/test/unit/theory/theory_strings_skolem_cache_black.h b/test/unit/theory/theory_strings_skolem_cache_black.h
index 34e8d88c6..e1b84492c 100644
--- a/test/unit/theory/theory_strings_skolem_cache_black.h
+++ b/test/unit/theory/theory_strings_skolem_cache_black.h
@@ -20,7 +20,7 @@
 #include "expr/node_manager.h"
 #include "theory/strings/skolem_cache.h"
 #include "util/rational.h"
-#include "util/regexp.h"
+#include "util/string.h"
 
 using namespace CVC4;
 using namespace CVC4::theory::strings;