From: Andrew Reynolds Date: Fri, 27 Mar 2020 19:04:44 +0000 (-0500) Subject: Move string utility file (#4164) X-Git-Tag: cvc5-1.0.0~3437 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4b7fc20dcce9eefdf568937f5e2c54141c4f5c5b;p=cvc5.git Move string utility file (#4164) Moves the string file to string.h. This is required since other required utilities will soon need to be added to regexp.h. --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dd58c74ee..809b00b04 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -961,11 +961,11 @@ install(FILES util/proof.h util/rational_cln_imp.h util/rational_gmp_imp.h - util/regexp.h util/resource_manager.h util/result.h util/sexpr.h util/statistics.h + util/string.h util/tuple.h util/unsafe_interrupt_exception.h ${CMAKE_CURRENT_BINARY_DIR}/util/floatingpoint.h diff --git a/src/cvc4.i b/src/cvc4.i index f9f8f5743..9dcff7f8e 100644 --- a/src/cvc4.i +++ b/src/cvc4.i @@ -274,10 +274,10 @@ std::set CVC4::JavaInputStreamAdapter::s_adapters; %include "util/cardinality.i" %include "util/hash.i" %include "util/proof.i" -%include "util/regexp.i" %include "util/result.i" %include "util/sexpr.i" %include "util/statistics.i" +%include "util/string.i" %include "util/tuple.i" %include "util/unsafe_interrupt_exception.i" diff --git a/src/theory/evaluator.h b/src/theory/evaluator.h index 58e179fbe..b9b15c6c6 100644 --- a/src/theory/evaluator.h +++ b/src/theory/evaluator.h @@ -27,7 +27,7 @@ #include "expr/node.h" #include "util/bitvector.h" #include "util/rational.h" -#include "util/regexp.h" +#include "util/string.h" namespace CVC4 { namespace theory { diff --git a/src/theory/strings/kinds b/src/theory/strings/kinds index 5b988061b..3f7abdb7c 100644 --- a/src/theory/strings/kinds +++ b/src/theory/strings/kinds @@ -37,15 +37,15 @@ operator STRING_REV 1 "string reverse" sort STRING_TYPE \ Cardinality::INTEGERS \ well-founded \ - "NodeManager::currentNM()->mkConst(::CVC4::String())" \ - "util/regexp.h" \ + "NodeManager::currentNM()->mkConst(::CVC4::String())" \ + "util/string.h" \ "String type" sort REGEXP_TYPE \ Cardinality::INTEGERS \ well-founded \ - "NodeManager::currentNM()->mkNode(REGEXP_EMPTY, std::vector() )" \ - "util/regexp.h" \ + "NodeManager::currentNM()->mkNode(REGEXP_EMPTY, std::vector() )" \ + "util/string.h" \ "RegExp type" enumerator STRING_TYPE \ @@ -55,7 +55,7 @@ enumerator STRING_TYPE \ constant CONST_STRING \ ::CVC4::String \ ::CVC4::strings::StringHashFunction \ - "util/regexp.h" \ + "util/string.h" \ "a string of characters" # equal equal / less than / output diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h index b9dbedba5..7845b2e00 100644 --- a/src/theory/strings/regexp_operation.h +++ b/src/theory/strings/regexp_operation.h @@ -24,10 +24,9 @@ #include #include #include "util/hash.h" -#include "util/regexp.h" +#include "util/string.h" #include "theory/theory.h" #include "theory/rewriter.h" -//#include "context/cdhashmap.h" namespace CVC4 { namespace theory { diff --git a/src/theory/strings/regexp_solver.h b/src/theory/strings/regexp_solver.h index 4880af905..d18604752 100644 --- a/src/theory/strings/regexp_solver.h +++ b/src/theory/strings/regexp_solver.h @@ -27,7 +27,7 @@ #include "theory/strings/inference_manager.h" #include "theory/strings/regexp_operation.h" #include "theory/strings/solver_state.h" -#include "util/regexp.h" +#include "util/string.h" namespace CVC4 { namespace theory { diff --git a/src/theory/strings/type_enumerator.cpp b/src/theory/strings/type_enumerator.cpp index 7352ae5de..d24206860 100644 --- a/src/theory/strings/type_enumerator.cpp +++ b/src/theory/strings/type_enumerator.cpp @@ -15,7 +15,7 @@ #include "theory/strings/type_enumerator.h" #include "theory/strings/theory_strings_utils.h" -#include "util/regexp.h" +#include "util/string.h" namespace CVC4 { namespace theory { diff --git a/src/theory/strings/word.cpp b/src/theory/strings/word.cpp index dd573b68c..0faeffd99 100644 --- a/src/theory/strings/word.cpp +++ b/src/theory/strings/word.cpp @@ -14,7 +14,7 @@ #include "theory/strings/word.h" -#include "util/regexp.h" +#include "util/string.h" using namespace CVC4::kind; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 75597edac..6895dc01a 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -25,8 +25,6 @@ libcvc4_add_sources( proof.h random.cpp random.h - regexp.cpp - regexp.h resource_manager.cpp resource_manager.h result.cpp @@ -43,6 +41,8 @@ libcvc4_add_sources( statistics.h statistics_registry.cpp statistics_registry.h + string.cpp + string.h tuple.h unsafe_interrupt_exception.h utility.cpp diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp deleted file mode 100644 index 36ba7182b..000000000 --- a/src/util/regexp.cpp +++ /dev/null @@ -1,488 +0,0 @@ -/********************* */ -/*! \file regexp.cpp - ** \verbatim - ** Top contributors (to current version): - ** Tim King, Tianyi Liang, Andrew Reynolds - ** This file is part of the CVC4 project. - ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS - ** in the top-level source directory) and their institutional affiliations. - ** All rights reserved. See the file COPYING in the top-level source - ** directory for licensing information.\endverbatim - ** - ** \brief [[ Add one-line brief description here ]] - ** - ** [[ Add lengthier description here ]] - ** \todo document this file - **/ - -#include "util/regexp.h" - -#include -#include -#include -#include -#include - -#include "base/check.h" -#include "base/exception.h" - -using namespace std; - -namespace CVC4 { - -static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values."); - -String::String(const std::vector &s) : d_str(s) -{ -#ifdef CVC4_ASSERTIONS - for (unsigned u : d_str) - { - Assert(u < num_codes()); - } -#endif -} - -int String::cmp(const String &y) const { - if (size() != y.size()) { - return size() < y.size() ? -1 : 1; - } - for (unsigned int i = 0; i < size(); ++i) { - if (d_str[i] != y.d_str[i]) { - unsigned cp = d_str[i]; - unsigned cpy = y.d_str[i]; - return cp < cpy ? -1 : 1; - } - } - return 0; -} - -String String::concat(const String &other) const { - std::vector ret_vec(d_str); - ret_vec.insert(ret_vec.end(), other.d_str.begin(), other.d_str.end()); - return String(ret_vec); -} - -bool String::strncmp(const String& y, std::size_t n) const -{ - std::size_t b = (size() >= y.size()) ? size() : y.size(); - std::size_t s = (size() <= y.size()) ? size() : y.size(); - if (n > s) { - if (b == s) { - n = s; - } else { - return false; - } - } - for (std::size_t i = 0; i < n; ++i) { - if (d_str[i] != y.d_str[i]) return false; - } - return true; -} - -bool String::rstrncmp(const String& y, std::size_t n) const -{ - std::size_t b = (size() >= y.size()) ? size() : y.size(); - std::size_t s = (size() <= y.size()) ? size() : y.size(); - if (n > s) { - if (b == s) { - n = s; - } else { - return false; - } - } - for (std::size_t i = 0; i < n; ++i) { - if (d_str[size() - i - 1] != y.d_str[y.size() - i - 1]) return false; - } - return true; -} - -void String::addCharToInternal(unsigned char ch, std::vector& str) -{ - // if not a printable character - if (ch > 127 || ch < 32) - { - std::stringstream serr; - serr << "Illegal string character: \"" << ch - << "\", must use escape sequence"; - throw CVC4::Exception(serr.str()); - } - else - { - str.push_back(static_cast(ch)); - } -} - -std::vector String::toInternal(const std::string& s, - bool useEscSequences) -{ - std::vector str; - unsigned i = 0; - while (i < s.size()) - { - // get the current character - char si = s[i]; - if (si != '\\' || !useEscSequences) - { - addCharToInternal(si, str); - ++i; - continue; - } - // the vector of characters, in case we fail to read an escape sequence - std::vector nonEscCache; - // process the '\' - addCharToInternal(si, nonEscCache); - ++i; - // are we an escape sequence? - bool isEscapeSequence = true; - // the string corresponding to the hexidecimal code point - std::stringstream hexString; - // is the slash followed by a 'u'? Could be last character. - if (i >= s.size() || s[i] != 'u') - { - isEscapeSequence = false; - } - else - { - // process the 'u' - addCharToInternal(s[i], nonEscCache); - ++i; - bool isStart = true; - bool isEnd = false; - bool hasBrace = false; - while (i < s.size()) - { - // add the next character - si = s[i]; - if (isStart) - { - isStart = false; - // possibly read '{' - if (si == '{') - { - hasBrace = true; - addCharToInternal(si, nonEscCache); - ++i; - continue; - } - } - else if (si == '}') - { - // can only end if we had an open brace and read at least one digit - isEscapeSequence = hasBrace && !hexString.str().empty(); - isEnd = true; - addCharToInternal(si, nonEscCache); - ++i; - break; - } - // must be a hex digit at this point - if (!isHexDigit(static_cast(si))) - { - isEscapeSequence = false; - break; - } - hexString << si; - addCharToInternal(si, nonEscCache); - ++i; - if (!hasBrace && hexString.str().size() == 4) - { - // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens - isEnd = true; - break; - } - else if (hasBrace && hexString.str().size() > 5) - { - // too many digits enclosed in brace, not an escape sequence - isEscapeSequence = false; - break; - } - } - if (!isEnd) - { - // if we were interupted before ending, then this is not a valid - // escape sequence - isEscapeSequence = false; - } - } - if (isEscapeSequence) - { - Assert(!hexString.str().empty() && hexString.str().size() <= 5); - // Otherwise, we add the escaped character. - // This is guaranteed not to overflow due to the length of hstr. - uint32_t val; - hexString >> std::hex >> val; - if (val > num_codes()) - { - // Failed due to being out of range. This can happen for strings of - // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not - // in the range [0-2]. - isEscapeSequence = false; - } - else - { - str.push_back(val); - } - } - // if we did not successfully parse an escape sequence, we add back all - // characters that we cached - if (!isEscapeSequence) - { - str.insert(str.end(), nonEscCache.begin(), nonEscCache.end()); - } - } -#ifdef CVC4_ASSERTIONS - for (unsigned u : str) - { - Assert(u < num_codes()); - } -#endif - return str; -} - -unsigned String::front() const -{ - Assert(!d_str.empty()); - return d_str.front(); -} - -unsigned String::back() const -{ - Assert(!d_str.empty()); - return d_str.back(); -} - -std::size_t String::overlap(const String &y) const { - std::size_t i = size() < y.size() ? size() : y.size(); - for (; i > 0; i--) { - String s = suffix(i); - String p = y.prefix(i); - if (s == p) { - return i; - } - } - return i; -} - -std::size_t String::roverlap(const String &y) const { - std::size_t i = size() < y.size() ? size() : y.size(); - for (; i > 0; i--) { - String s = prefix(i); - String p = y.suffix(i); - if (s == p) { - return i; - } - } - return i; -} - -std::string String::toString(bool useEscSequences) const { - std::stringstream str; - for (unsigned int i = 0; i < size(); ++i) { - // we always print forward slash as a code point so that it cannot - // be interpreted as specifying part of a code point, e.g. the string - // '\' + 'u' + '0' of length three. - if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences) - { - str << static_cast(d_str[i]); - } - else - { - std::stringstream ss; - ss << std::hex << d_str[i]; - str << "\\u{" << ss.str() << "}"; - } - } - return str.str(); -} - -bool String::isLeq(const String &y) const -{ - for (unsigned i = 0; i < size(); ++i) - { - if (i >= y.size()) - { - return false; - } - unsigned ci = d_str[i]; - unsigned cyi = y.d_str[i]; - if (ci > cyi) - { - return false; - } - if (ci < cyi) - { - return true; - } - } - return true; -} - -bool String::isRepeated() const { - if (size() > 1) { - unsigned int f = d_str[0]; - for (unsigned i = 1; i < size(); ++i) { - if (f != d_str[i]) return false; - } - } - return true; -} - -bool String::tailcmp(const String &y, int &c) const { - int id_x = size() - 1; - int id_y = y.size() - 1; - while (id_x >= 0 && id_y >= 0) { - if (d_str[id_x] != y.d_str[id_y]) { - c = id_x; - return false; - } - --id_x; - --id_y; - } - c = id_x == -1 ? (-(id_y + 1)) : (id_x + 1); - return true; -} - -std::size_t String::find(const String &y, const std::size_t start) const { - if (size() < y.size() + start) return std::string::npos; - if (y.empty()) return start; - if (empty()) return std::string::npos; - - std::vector::const_iterator itr = std::search( - d_str.begin() + start, d_str.end(), y.d_str.begin(), y.d_str.end()); - if (itr != d_str.end()) { - return itr - d_str.begin(); - } - return std::string::npos; -} - -std::size_t String::rfind(const String &y, const std::size_t start) const { - if (size() < y.size() + start) return std::string::npos; - if (y.empty()) return start; - if (empty()) return std::string::npos; - - std::vector::const_reverse_iterator itr = std::search( - d_str.rbegin() + start, d_str.rend(), y.d_str.rbegin(), y.d_str.rend()); - if (itr != d_str.rend()) { - return itr - d_str.rbegin(); - } - return std::string::npos; -} - -bool String::hasPrefix(const String& y) const -{ - size_t s = size(); - size_t ys = y.size(); - if (ys > s) - { - return false; - } - for (size_t i = 0; i < ys; i++) - { - if (d_str[i] != y.d_str[i]) - { - return false; - } - } - return true; -} - -bool String::hasSuffix(const String& y) const -{ - size_t s = size(); - size_t ys = y.size(); - if (ys > s) - { - return false; - } - size_t idiff = s - ys; - for (size_t i = 0; i < ys; i++) - { - if (d_str[i + idiff] != y.d_str[i]) - { - return false; - } - } - return true; -} - -String String::replace(const String &s, const String &t) const { - std::size_t ret = find(s); - if (ret != std::string::npos) { - std::vector vec; - vec.insert(vec.begin(), d_str.begin(), d_str.begin() + ret); - vec.insert(vec.end(), t.d_str.begin(), t.d_str.end()); - vec.insert(vec.end(), d_str.begin() + ret + s.size(), d_str.end()); - return String(vec); - } else { - return *this; - } -} - -String String::substr(std::size_t i) const { - Assert(i <= size()); - std::vector ret_vec; - std::vector::const_iterator itr = d_str.begin() + i; - ret_vec.insert(ret_vec.end(), itr, d_str.end()); - return String(ret_vec); -} - -String String::substr(std::size_t i, std::size_t j) const { - Assert(i + j <= size()); - std::vector ret_vec; - std::vector::const_iterator itr = d_str.begin() + i; - ret_vec.insert(ret_vec.end(), itr, itr + j); - return String(ret_vec); -} - -bool String::noOverlapWith(const String& y) const -{ - return y.find(*this) == std::string::npos - && this->find(y) == std::string::npos && this->overlap(y) == 0 - && y.overlap(*this) == 0; -} - -bool String::isNumber() const { - if (d_str.empty()) { - return false; - } - for (unsigned character : d_str) { - if (!isDigit(character)) - { - return false; - } - } - return true; -} - -bool String::isDigit(unsigned character) -{ - // '0' to '9' - return 48 <= character && character <= 57; -} - -bool String::isHexDigit(unsigned character) -{ - // '0' to '9' or 'A' to 'F' or 'a' to 'f' - return isDigit(character) || (65 <= character && character <= 70) - || (97 <= character && character <= 102); -} - -bool String::isPrintable(unsigned character) -{ - // Unicode 0x00020 (' ') to 0x0007E ('~') - return 32 <= character && character <= 126; -} - -size_t String::maxSize() { return std::numeric_limits::max(); } - -Rational String::toNumber() const -{ - // when smt2 standard for strings is set, this may change, based on the - // semantics of str.from.int for leading zeros - return Rational(toString()); -} - -std::ostream &operator<<(std::ostream &os, const String &s) { - return os << "\"" << s.toString(true) << "\""; -} - -} // namespace CVC4 diff --git a/src/util/regexp.h b/src/util/regexp.h deleted file mode 100644 index 56fb969a3..000000000 --- a/src/util/regexp.h +++ /dev/null @@ -1,274 +0,0 @@ -/********************* */ -/*! \file regexp.h - ** \verbatim - ** Top contributors (to current version): - ** Andrew Reynolds, Tim King, Tianyi Liang - ** This file is part of the CVC4 project. - ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS - ** in the top-level source directory) and their institutional affiliations. - ** All rights reserved. See the file COPYING in the top-level source - ** directory for licensing information.\endverbatim - ** - ** \brief [[ Add one-line brief description here ]] - ** - ** [[ Add lengthier description here ]] - ** \todo document this file - **/ - -#include "cvc4_public.h" - -#ifndef CVC4__REGEXP_H -#define CVC4__REGEXP_H - -#include -#include -#include -#include -#include -#include "util/rational.h" - -namespace CVC4 { - -/** The CVC4 string class - * - * This data structure is the domain of values for the string type. It can also - * be used as a generic utility for representing strings. - */ -class CVC4_PUBLIC String { - public: - /** - * This is the cardinality of the alphabet that is representable by this - * class. Notice that this must be greater than or equal to the cardinality - * of the alphabet that the string theory reasons about. - * - * This must be strictly less than std::numeric_limits::max(). - * - * As per the SMT-LIB standard for strings, we support the first 3 planes of - * Unicode characters, where 196608 = 3*16^4. - */ - static inline unsigned num_codes() { return 196608; } - /** constructors for String - * - * Internally, a CVC4::String is represented by a vector of unsigned - * integers (d_str) representing the code points of the characters. - * - * To build a string from a C++ string, we may process escape sequences - * according to the SMT-LIB standard. In particular, if useEscSequences is - * true, we convert unicode escape sequences: - * \u d_3 d_2 d_1 d_0 - * \u{d_0} - * \u{d_1 d_0} - * \u{d_2 d_1 d_0} - * \u{d_3 d_2 d_1 d_0} - * \u{d_4 d_3 d_2 d_1 d_0} - * where d_0 ... d_4 are hexidecimal digits, to the appropriate character. - * - * If useEscSequences is false, then the characters of the constructed - * CVC4::String correspond one-to-one with the input string. - */ - String() = default; - explicit String(const std::string& s, bool useEscSequences = false) - : d_str(toInternal(s, useEscSequences)) - { - } - explicit String(const char* s, bool useEscSequences = false) - : d_str(toInternal(std::string(s), useEscSequences)) - { - } - explicit String(const std::vector& s); - - String& operator=(const String& y) { - if (this != &y) { - d_str = y.d_str; - } - return *this; - } - - String concat(const String& other) const; - - bool operator==(const String& y) const { return cmp(y) == 0; } - bool operator!=(const String& y) const { return cmp(y) != 0; } - bool operator<(const String& y) const { return cmp(y) < 0; } - bool operator>(const String& y) const { return cmp(y) > 0; } - bool operator<=(const String& y) const { return cmp(y) <= 0; } - bool operator>=(const String& y) const { return cmp(y) >= 0; } - - /** - * Returns true if this string is equal to y for their first n characters. - * If n is larger than the length of this string or y, this method returns - * true if and only if this string is equal to y. - */ - bool strncmp(const String& y, std::size_t n) const; - /** - * Returns true if this string is equal to y for their last n characters. - * Similar to strncmp, if n is larger than the length of this string or y, - * this method returns true if and only if this string is equal to y. - */ - bool rstrncmp(const String& y, std::size_t n) const; - - /* toString - * Converts this string to a std::string. - * - * The unprintable characters are converted to unicode escape sequences as - * described above. - * - * If useEscSequences is false, the string's printable characters are - * printed as characters. Notice that for all std::string s having only - * printable characters, we have that - * CVC4::String( s ).toString() = s. - */ - std::string toString(bool useEscSequences = false) const; - /** is this the empty string? */ - bool empty() const { return d_str.empty(); } - /** is this the empty string? */ - bool isEmptyString() const { return empty(); } - /** is less than or equal to string y */ - bool isLeq(const String& y) const; - /** Return the length of the string */ - std::size_t size() const { return d_str.size(); } - - bool isRepeated() const; - bool tailcmp(const String& y, int& c) const; - - /** - * Return the first position y occurs in this string, or std::string::npos - * otherwise. - */ - std::size_t find(const String& y, const std::size_t start = 0) const; - /** - * Return the first position y occurs in this string searching from the end, - * or std::string::npos otherwise. - */ - std::size_t rfind(const String& y, const std::size_t start = 0) const; - /** Returns true if y is a prefix of this */ - bool hasPrefix(const String& y) const; - /** Returns true if y is a suffix of this */ - bool hasSuffix(const String& y) const; - /** Replace the first occurrence of s in this string with t */ - String replace(const String& s, const String& t) const; - /** Return the substring of this string starting at index i */ - String substr(std::size_t i) const; - /** Return the substring of this string starting at index i with size at most - * j */ - String substr(std::size_t i, std::size_t j) const; - /** Return the prefix of this string of size at most i */ - String prefix(std::size_t i) const { return substr(0, i); } - /** Return the suffix of this string of size at most i */ - String suffix(std::size_t i) const { return substr(size() - i, i); } - - /** - * Checks if there is any overlap between this string and another string. This - * corresponds to checking whether one string contains the other and whether a - * substring of one is a prefix of the other and vice-versa. - * - * @param y The other string - * @return True if there is an overlap, false otherwise - */ - bool noOverlapWith(const String& y) const; - - /** string overlap - * - * if overlap returns m>0, - * then the maximal suffix of this string that is a prefix of y is of length m. - * - * For example, if x is "abcdef", then: - * x.overlap("defg") = 3 - * x.overlap("ab") = 0 - * x.overlap("d") = 0 - * x.overlap("bcdefdef") = 5 - */ - std::size_t overlap(const String& y) const; - /** string reverse overlap - * - * if roverlap returns m>0, - * then the maximal prefix of this string that is a suffix of y is of length m. - * - * For example, if x is "abcdef", then: - * x.roverlap("aaabc") = 3 - * x.roverlap("def") = 0 - * x.roverlap("d") = 0 - * x.roverlap("defabcde") = 5 - * - * Notice that x.overlap(y) = y.roverlap(x) - */ - std::size_t roverlap(const String& y) const; - - /** - * Returns true if this string corresponds in text to a number, for example - * this returns true for strings "7", "12", "004", "0" and false for strings - * "abc", "4a", "-4", "". - */ - bool isNumber() const; - /** Returns the corresponding rational for the text of this string. */ - Rational toNumber() const; - /** Get the unsigned representation (code points) of this string */ - const std::vector& getVec() const { return d_str; } - /** - * Get the unsigned (code point) value of the first character in this string - */ - unsigned front() const; - /** - * Get the unsigned (code point) value of the last character in this string - */ - unsigned back() const; - /** is the unsigned a digit? - * - * This is true for code points between 48 ('0') and 57 ('9'). - */ - static bool isDigit(unsigned character); - /** is the unsigned a hexidecimal digit? - * - * This is true for code points between 48 ('0') and 57 ('9'), code points - * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f). - */ - static bool isHexDigit(unsigned character); - /** is the unsigned a printable code point? - * - * This is true for Unicode 32 (' ') to 126 ('~'). - */ - static bool isPrintable(unsigned character); - - /** - * Returns the maximum length of string representable by this class. - * Corresponds to the maximum size of d_str. - */ - static size_t maxSize(); - private: - /** - * Helper for toInternal: add character ch to vector vec, storing a string in - * internal format. This throws an error if ch is not a printable character, - * since non-printable characters must be escaped in SMT-LIB. - */ - static void addCharToInternal(unsigned char ch, std::vector& vec); - /** - * Convert the string s to the internal format (vector of code points). - * The argument useEscSequences is whether to process unicode escape - * sequences. - */ - static std::vector toInternal(const std::string& s, - bool useEscSequences); - - /** - * Returns a negative number if *this < y, 0 if *this and y are equal and a - * positive number if *this > y. - */ - int cmp(const String& y) const; - - std::vector d_str; -}; /* class String */ - -namespace strings { - -struct CVC4_PUBLIC StringHashFunction { - size_t operator()(const ::CVC4::String& s) const { - return std::hash()(s.toString()); - } -}; /* struct StringHashFunction */ - -} // namespace strings - -std::ostream& operator<<(std::ostream& os, const String& s) CVC4_PUBLIC; - -} // namespace CVC4 - -#endif /* CVC4__REGEXP_H */ diff --git a/src/util/regexp.i b/src/util/regexp.i deleted file mode 100644 index afc51abd7..000000000 --- a/src/util/regexp.i +++ /dev/null @@ -1,25 +0,0 @@ -%{ -#include "util/regexp.h" -%} - -%rename(CVC4String) String; -%rename(CVC4StringHashFunction) CVC4::strings::StringHashFunction; - -%ignore CVC4::String::String(const std::string&); - -%rename(assign) CVC4::String::operator=(const String&); -%rename(getChar) CVC4::String::operator[](const unsigned int) const; -%rename(equals) CVC4::String::operator==(const String&) const; -%ignore CVC4::String::operator!=(const String&) const; -%rename(less) CVC4::String::operator<(const String&) const; -%rename(lessEqual) CVC4::String::operator<=(const String&) const; -%rename(greater) CVC4::String::operator>(const String&) const; -%rename(greaterEqual) CVC4::String::operator>=(const String&) const; - -%rename(apply) CVC4::strings::StringHashFunction::operator()(const ::CVC4::String&) const; - -%ignore CVC4::operator<<(std::ostream&, const String&); - -%apply int &OUTPUT { int &c }; -%include "util/regexp.h" -%clear int &c; diff --git a/src/util/string.cpp b/src/util/string.cpp new file mode 100644 index 000000000..ff522ba7b --- /dev/null +++ b/src/util/string.cpp @@ -0,0 +1,485 @@ +/********************* */ +/*! \file string.cpp + ** \verbatim + ** Top contributors (to current version): + ** Tim King, Tianyi Liang, Andrew Reynolds + ** This file is part of the CVC4 project. + ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS + ** in the top-level source directory) and their institutional affiliations. + ** All rights reserved. See the file COPYING in the top-level source + ** directory for licensing information.\endverbatim + ** + ** \brief Implementation of the string data type. + **/ + +#include "util/string.h" + +#include +#include +#include +#include +#include + +#include "base/check.h" +#include "base/exception.h" + +using namespace std; + +namespace CVC4 { + +static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values."); + +String::String(const std::vector &s) : d_str(s) +{ +#ifdef CVC4_ASSERTIONS + for (unsigned u : d_str) + { + Assert(u < num_codes()); + } +#endif +} + +int String::cmp(const String &y) const { + if (size() != y.size()) { + return size() < y.size() ? -1 : 1; + } + for (unsigned int i = 0; i < size(); ++i) { + if (d_str[i] != y.d_str[i]) { + unsigned cp = d_str[i]; + unsigned cpy = y.d_str[i]; + return cp < cpy ? -1 : 1; + } + } + return 0; +} + +String String::concat(const String &other) const { + std::vector ret_vec(d_str); + ret_vec.insert(ret_vec.end(), other.d_str.begin(), other.d_str.end()); + return String(ret_vec); +} + +bool String::strncmp(const String& y, std::size_t n) const +{ + std::size_t b = (size() >= y.size()) ? size() : y.size(); + std::size_t s = (size() <= y.size()) ? size() : y.size(); + if (n > s) { + if (b == s) { + n = s; + } else { + return false; + } + } + for (std::size_t i = 0; i < n; ++i) { + if (d_str[i] != y.d_str[i]) return false; + } + return true; +} + +bool String::rstrncmp(const String& y, std::size_t n) const +{ + std::size_t b = (size() >= y.size()) ? size() : y.size(); + std::size_t s = (size() <= y.size()) ? size() : y.size(); + if (n > s) { + if (b == s) { + n = s; + } else { + return false; + } + } + for (std::size_t i = 0; i < n; ++i) { + if (d_str[size() - i - 1] != y.d_str[y.size() - i - 1]) return false; + } + return true; +} + +void String::addCharToInternal(unsigned char ch, std::vector& str) +{ + // if not a printable character + if (ch > 127 || ch < 32) + { + std::stringstream serr; + serr << "Illegal string character: \"" << ch + << "\", must use escape sequence"; + throw CVC4::Exception(serr.str()); + } + else + { + str.push_back(static_cast(ch)); + } +} + +std::vector String::toInternal(const std::string& s, + bool useEscSequences) +{ + std::vector str; + unsigned i = 0; + while (i < s.size()) + { + // get the current character + char si = s[i]; + if (si != '\\' || !useEscSequences) + { + addCharToInternal(si, str); + ++i; + continue; + } + // the vector of characters, in case we fail to read an escape sequence + std::vector nonEscCache; + // process the '\' + addCharToInternal(si, nonEscCache); + ++i; + // are we an escape sequence? + bool isEscapeSequence = true; + // the string corresponding to the hexidecimal code point + std::stringstream hexString; + // is the slash followed by a 'u'? Could be last character. + if (i >= s.size() || s[i] != 'u') + { + isEscapeSequence = false; + } + else + { + // process the 'u' + addCharToInternal(s[i], nonEscCache); + ++i; + bool isStart = true; + bool isEnd = false; + bool hasBrace = false; + while (i < s.size()) + { + // add the next character + si = s[i]; + if (isStart) + { + isStart = false; + // possibly read '{' + if (si == '{') + { + hasBrace = true; + addCharToInternal(si, nonEscCache); + ++i; + continue; + } + } + else if (si == '}') + { + // can only end if we had an open brace and read at least one digit + isEscapeSequence = hasBrace && !hexString.str().empty(); + isEnd = true; + addCharToInternal(si, nonEscCache); + ++i; + break; + } + // must be a hex digit at this point + if (!isHexDigit(static_cast(si))) + { + isEscapeSequence = false; + break; + } + hexString << si; + addCharToInternal(si, nonEscCache); + ++i; + if (!hasBrace && hexString.str().size() == 4) + { + // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens + isEnd = true; + break; + } + else if (hasBrace && hexString.str().size() > 5) + { + // too many digits enclosed in brace, not an escape sequence + isEscapeSequence = false; + break; + } + } + if (!isEnd) + { + // if we were interupted before ending, then this is not a valid + // escape sequence + isEscapeSequence = false; + } + } + if (isEscapeSequence) + { + Assert(!hexString.str().empty() && hexString.str().size() <= 5); + // Otherwise, we add the escaped character. + // This is guaranteed not to overflow due to the length of hstr. + uint32_t val; + hexString >> std::hex >> val; + if (val > num_codes()) + { + // Failed due to being out of range. This can happen for strings of + // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not + // in the range [0-2]. + isEscapeSequence = false; + } + else + { + str.push_back(val); + } + } + // if we did not successfully parse an escape sequence, we add back all + // characters that we cached + if (!isEscapeSequence) + { + str.insert(str.end(), nonEscCache.begin(), nonEscCache.end()); + } + } +#ifdef CVC4_ASSERTIONS + for (unsigned u : str) + { + Assert(u < num_codes()); + } +#endif + return str; +} + +unsigned String::front() const +{ + Assert(!d_str.empty()); + return d_str.front(); +} + +unsigned String::back() const +{ + Assert(!d_str.empty()); + return d_str.back(); +} + +std::size_t String::overlap(const String &y) const { + std::size_t i = size() < y.size() ? size() : y.size(); + for (; i > 0; i--) { + String s = suffix(i); + String p = y.prefix(i); + if (s == p) { + return i; + } + } + return i; +} + +std::size_t String::roverlap(const String &y) const { + std::size_t i = size() < y.size() ? size() : y.size(); + for (; i > 0; i--) { + String s = prefix(i); + String p = y.suffix(i); + if (s == p) { + return i; + } + } + return i; +} + +std::string String::toString(bool useEscSequences) const { + std::stringstream str; + for (unsigned int i = 0; i < size(); ++i) { + // we always print forward slash as a code point so that it cannot + // be interpreted as specifying part of a code point, e.g. the string + // '\' + 'u' + '0' of length three. + if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences) + { + str << static_cast(d_str[i]); + } + else + { + std::stringstream ss; + ss << std::hex << d_str[i]; + str << "\\u{" << ss.str() << "}"; + } + } + return str.str(); +} + +bool String::isLeq(const String &y) const +{ + for (unsigned i = 0; i < size(); ++i) + { + if (i >= y.size()) + { + return false; + } + unsigned ci = d_str[i]; + unsigned cyi = y.d_str[i]; + if (ci > cyi) + { + return false; + } + if (ci < cyi) + { + return true; + } + } + return true; +} + +bool String::isRepeated() const { + if (size() > 1) { + unsigned int f = d_str[0]; + for (unsigned i = 1; i < size(); ++i) { + if (f != d_str[i]) return false; + } + } + return true; +} + +bool String::tailcmp(const String &y, int &c) const { + int id_x = size() - 1; + int id_y = y.size() - 1; + while (id_x >= 0 && id_y >= 0) { + if (d_str[id_x] != y.d_str[id_y]) { + c = id_x; + return false; + } + --id_x; + --id_y; + } + c = id_x == -1 ? (-(id_y + 1)) : (id_x + 1); + return true; +} + +std::size_t String::find(const String &y, const std::size_t start) const { + if (size() < y.size() + start) return std::string::npos; + if (y.empty()) return start; + if (empty()) return std::string::npos; + + std::vector::const_iterator itr = std::search( + d_str.begin() + start, d_str.end(), y.d_str.begin(), y.d_str.end()); + if (itr != d_str.end()) { + return itr - d_str.begin(); + } + return std::string::npos; +} + +std::size_t String::rfind(const String &y, const std::size_t start) const { + if (size() < y.size() + start) return std::string::npos; + if (y.empty()) return start; + if (empty()) return std::string::npos; + + std::vector::const_reverse_iterator itr = std::search( + d_str.rbegin() + start, d_str.rend(), y.d_str.rbegin(), y.d_str.rend()); + if (itr != d_str.rend()) { + return itr - d_str.rbegin(); + } + return std::string::npos; +} + +bool String::hasPrefix(const String& y) const +{ + size_t s = size(); + size_t ys = y.size(); + if (ys > s) + { + return false; + } + for (size_t i = 0; i < ys; i++) + { + if (d_str[i] != y.d_str[i]) + { + return false; + } + } + return true; +} + +bool String::hasSuffix(const String& y) const +{ + size_t s = size(); + size_t ys = y.size(); + if (ys > s) + { + return false; + } + size_t idiff = s - ys; + for (size_t i = 0; i < ys; i++) + { + if (d_str[i + idiff] != y.d_str[i]) + { + return false; + } + } + return true; +} + +String String::replace(const String &s, const String &t) const { + std::size_t ret = find(s); + if (ret != std::string::npos) { + std::vector vec; + vec.insert(vec.begin(), d_str.begin(), d_str.begin() + ret); + vec.insert(vec.end(), t.d_str.begin(), t.d_str.end()); + vec.insert(vec.end(), d_str.begin() + ret + s.size(), d_str.end()); + return String(vec); + } else { + return *this; + } +} + +String String::substr(std::size_t i) const { + Assert(i <= size()); + std::vector ret_vec; + std::vector::const_iterator itr = d_str.begin() + i; + ret_vec.insert(ret_vec.end(), itr, d_str.end()); + return String(ret_vec); +} + +String String::substr(std::size_t i, std::size_t j) const { + Assert(i + j <= size()); + std::vector ret_vec; + std::vector::const_iterator itr = d_str.begin() + i; + ret_vec.insert(ret_vec.end(), itr, itr + j); + return String(ret_vec); +} + +bool String::noOverlapWith(const String& y) const +{ + return y.find(*this) == std::string::npos + && this->find(y) == std::string::npos && this->overlap(y) == 0 + && y.overlap(*this) == 0; +} + +bool String::isNumber() const { + if (d_str.empty()) { + return false; + } + for (unsigned character : d_str) { + if (!isDigit(character)) + { + return false; + } + } + return true; +} + +bool String::isDigit(unsigned character) +{ + // '0' to '9' + return 48 <= character && character <= 57; +} + +bool String::isHexDigit(unsigned character) +{ + // '0' to '9' or 'A' to 'F' or 'a' to 'f' + return isDigit(character) || (65 <= character && character <= 70) + || (97 <= character && character <= 102); +} + +bool String::isPrintable(unsigned character) +{ + // Unicode 0x00020 (' ') to 0x0007E ('~') + return 32 <= character && character <= 126; +} + +size_t String::maxSize() { return std::numeric_limits::max(); } + +Rational String::toNumber() const +{ + // when smt2 standard for strings is set, this may change, based on the + // semantics of str.from.int for leading zeros + return Rational(toString()); +} + +std::ostream &operator<<(std::ostream &os, const String &s) { + return os << "\"" << s.toString(true) << "\""; +} + +} // namespace CVC4 diff --git a/src/util/string.h b/src/util/string.h new file mode 100644 index 000000000..032105812 --- /dev/null +++ b/src/util/string.h @@ -0,0 +1,271 @@ +/********************* */ +/*! \file string.h + ** \verbatim + ** Top contributors (to current version): + ** Andrew Reynolds, Tim King, Tianyi Liang + ** This file is part of the CVC4 project. + ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS + ** in the top-level source directory) and their institutional affiliations. + ** All rights reserved. See the file COPYING in the top-level source + ** directory for licensing information.\endverbatim + ** + ** \brief The string data type. + **/ + +#include "cvc4_public.h" + +#ifndef CVC4__UTIL__STRING_H +#define CVC4__UTIL__STRING_H + +#include +#include +#include +#include +#include +#include "util/rational.h" + +namespace CVC4 { + +/** The CVC4 string class + * + * This data structure is the domain of values for the string type. It can also + * be used as a generic utility for representing strings. + */ +class CVC4_PUBLIC String { + public: + /** + * This is the cardinality of the alphabet that is representable by this + * class. Notice that this must be greater than or equal to the cardinality + * of the alphabet that the string theory reasons about. + * + * This must be strictly less than std::numeric_limits::max(). + * + * As per the SMT-LIB standard for strings, we support the first 3 planes of + * Unicode characters, where 196608 = 3*16^4. + */ + static inline unsigned num_codes() { return 196608; } + /** constructors for String + * + * Internally, a CVC4::String is represented by a vector of unsigned + * integers (d_str) representing the code points of the characters. + * + * To build a string from a C++ string, we may process escape sequences + * according to the SMT-LIB standard. In particular, if useEscSequences is + * true, we convert unicode escape sequences: + * \u d_3 d_2 d_1 d_0 + * \u{d_0} + * \u{d_1 d_0} + * \u{d_2 d_1 d_0} + * \u{d_3 d_2 d_1 d_0} + * \u{d_4 d_3 d_2 d_1 d_0} + * where d_0 ... d_4 are hexidecimal digits, to the appropriate character. + * + * If useEscSequences is false, then the characters of the constructed + * CVC4::String correspond one-to-one with the input string. + */ + String() = default; + explicit String(const std::string& s, bool useEscSequences = false) + : d_str(toInternal(s, useEscSequences)) + { + } + explicit String(const char* s, bool useEscSequences = false) + : d_str(toInternal(std::string(s), useEscSequences)) + { + } + explicit String(const std::vector& s); + + String& operator=(const String& y) { + if (this != &y) { + d_str = y.d_str; + } + return *this; + } + + String concat(const String& other) const; + + bool operator==(const String& y) const { return cmp(y) == 0; } + bool operator!=(const String& y) const { return cmp(y) != 0; } + bool operator<(const String& y) const { return cmp(y) < 0; } + bool operator>(const String& y) const { return cmp(y) > 0; } + bool operator<=(const String& y) const { return cmp(y) <= 0; } + bool operator>=(const String& y) const { return cmp(y) >= 0; } + + /** + * Returns true if this string is equal to y for their first n characters. + * If n is larger than the length of this string or y, this method returns + * true if and only if this string is equal to y. + */ + bool strncmp(const String& y, std::size_t n) const; + /** + * Returns true if this string is equal to y for their last n characters. + * Similar to strncmp, if n is larger than the length of this string or y, + * this method returns true if and only if this string is equal to y. + */ + bool rstrncmp(const String& y, std::size_t n) const; + + /* toString + * Converts this string to a std::string. + * + * The unprintable characters are converted to unicode escape sequences as + * described above. + * + * If useEscSequences is false, the string's printable characters are + * printed as characters. Notice that for all std::string s having only + * printable characters, we have that + * CVC4::String( s ).toString() = s. + */ + std::string toString(bool useEscSequences = false) const; + /** is this the empty string? */ + bool empty() const { return d_str.empty(); } + /** is this the empty string? */ + bool isEmptyString() const { return empty(); } + /** is less than or equal to string y */ + bool isLeq(const String& y) const; + /** Return the length of the string */ + std::size_t size() const { return d_str.size(); } + + bool isRepeated() const; + bool tailcmp(const String& y, int& c) const; + + /** + * Return the first position y occurs in this string, or std::string::npos + * otherwise. + */ + std::size_t find(const String& y, const std::size_t start = 0) const; + /** + * Return the first position y occurs in this string searching from the end, + * or std::string::npos otherwise. + */ + std::size_t rfind(const String& y, const std::size_t start = 0) const; + /** Returns true if y is a prefix of this */ + bool hasPrefix(const String& y) const; + /** Returns true if y is a suffix of this */ + bool hasSuffix(const String& y) const; + /** Replace the first occurrence of s in this string with t */ + String replace(const String& s, const String& t) const; + /** Return the substring of this string starting at index i */ + String substr(std::size_t i) const; + /** Return the substring of this string starting at index i with size at most + * j */ + String substr(std::size_t i, std::size_t j) const; + /** Return the prefix of this string of size at most i */ + String prefix(std::size_t i) const { return substr(0, i); } + /** Return the suffix of this string of size at most i */ + String suffix(std::size_t i) const { return substr(size() - i, i); } + + /** + * Checks if there is any overlap between this string and another string. This + * corresponds to checking whether one string contains the other and whether a + * substring of one is a prefix of the other and vice-versa. + * + * @param y The other string + * @return True if there is an overlap, false otherwise + */ + bool noOverlapWith(const String& y) const; + + /** string overlap + * + * if overlap returns m>0, + * then the maximal suffix of this string that is a prefix of y is of length m. + * + * For example, if x is "abcdef", then: + * x.overlap("defg") = 3 + * x.overlap("ab") = 0 + * x.overlap("d") = 0 + * x.overlap("bcdefdef") = 5 + */ + std::size_t overlap(const String& y) const; + /** string reverse overlap + * + * if roverlap returns m>0, + * then the maximal prefix of this string that is a suffix of y is of length m. + * + * For example, if x is "abcdef", then: + * x.roverlap("aaabc") = 3 + * x.roverlap("def") = 0 + * x.roverlap("d") = 0 + * x.roverlap("defabcde") = 5 + * + * Notice that x.overlap(y) = y.roverlap(x) + */ + std::size_t roverlap(const String& y) const; + + /** + * Returns true if this string corresponds in text to a number, for example + * this returns true for strings "7", "12", "004", "0" and false for strings + * "abc", "4a", "-4", "". + */ + bool isNumber() const; + /** Returns the corresponding rational for the text of this string. */ + Rational toNumber() const; + /** Get the unsigned representation (code points) of this string */ + const std::vector& getVec() const { return d_str; } + /** + * Get the unsigned (code point) value of the first character in this string + */ + unsigned front() const; + /** + * Get the unsigned (code point) value of the last character in this string + */ + unsigned back() const; + /** is the unsigned a digit? + * + * This is true for code points between 48 ('0') and 57 ('9'). + */ + static bool isDigit(unsigned character); + /** is the unsigned a hexidecimal digit? + * + * This is true for code points between 48 ('0') and 57 ('9'), code points + * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f). + */ + static bool isHexDigit(unsigned character); + /** is the unsigned a printable code point? + * + * This is true for Unicode 32 (' ') to 126 ('~'). + */ + static bool isPrintable(unsigned character); + + /** + * Returns the maximum length of string representable by this class. + * Corresponds to the maximum size of d_str. + */ + static size_t maxSize(); + private: + /** + * Helper for toInternal: add character ch to vector vec, storing a string in + * internal format. This throws an error if ch is not a printable character, + * since non-printable characters must be escaped in SMT-LIB. + */ + static void addCharToInternal(unsigned char ch, std::vector& vec); + /** + * Convert the string s to the internal format (vector of code points). + * The argument useEscSequences is whether to process unicode escape + * sequences. + */ + static std::vector toInternal(const std::string& s, + bool useEscSequences); + + /** + * Returns a negative number if *this < y, 0 if *this and y are equal and a + * positive number if *this > y. + */ + int cmp(const String& y) const; + + std::vector d_str; +}; /* class String */ + +namespace strings { + +struct CVC4_PUBLIC StringHashFunction { + size_t operator()(const ::CVC4::String& s) const { + return std::hash()(s.toString()); + } +}; /* struct StringHashFunction */ + +} // namespace strings + +std::ostream& operator<<(std::ostream& os, const String& s) CVC4_PUBLIC; + +} // namespace CVC4 + +#endif /* CVC4__UTIL__STRING_H */ diff --git a/src/util/string.i b/src/util/string.i new file mode 100644 index 000000000..1ded901aa --- /dev/null +++ b/src/util/string.i @@ -0,0 +1,25 @@ +%{ +#include "util/string.h" +%} + +%rename(CVC4String) String; +%rename(CVC4StringHashFunction) CVC4::strings::StringHashFunction; + +%ignore CVC4::String::String(const std::string&); + +%rename(assign) CVC4::String::operator=(const String&); +%rename(getChar) CVC4::String::operator[](const unsigned int) const; +%rename(equals) CVC4::String::operator==(const String&) const; +%ignore CVC4::String::operator!=(const String&) const; +%rename(less) CVC4::String::operator<(const String&) const; +%rename(lessEqual) CVC4::String::operator<=(const String&) const; +%rename(greater) CVC4::String::operator>(const String&) const; +%rename(greaterEqual) CVC4::String::operator>=(const String&) const; + +%rename(apply) CVC4::strings::StringHashFunction::operator()(const ::CVC4::String&) const; + +%ignore CVC4::operator<<(std::ostream&, const String&); + +%apply int &OUTPUT { int &c }; +%include "util/string.h" +%clear int &c; diff --git a/test/unit/theory/theory_strings_skolem_cache_black.h b/test/unit/theory/theory_strings_skolem_cache_black.h index 34e8d88c6..e1b84492c 100644 --- a/test/unit/theory/theory_strings_skolem_cache_black.h +++ b/test/unit/theory/theory_strings_skolem_cache_black.h @@ -20,7 +20,7 @@ #include "expr/node_manager.h" #include "theory/strings/skolem_cache.h" #include "util/rational.h" -#include "util/regexp.h" +#include "util/string.h" using namespace CVC4; using namespace CVC4::theory::strings;