From b0fa6b29a1e15b231547eab147b49f2883a139de Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Mon, 24 Feb 2020 16:21:45 -0600 Subject: [PATCH] Utilities for words (#3797) This adds a utility file for handling operations over constant "words" (this will eventually generalize string or sequence constants). This is work towards CVC4/cvc4-projects#23. Also improves documentation in regexp.h. --- src/CMakeLists.txt | 2 + src/theory/strings/word.cpp | 227 +++++++++++++++++++ src/theory/strings/word.h | 121 ++++++++++ src/util/regexp.cpp | 7 + src/util/regexp.h | 25 +- test/unit/theory/CMakeLists.txt | 1 + test/unit/theory/theory_strings_word_white.h | 149 ++++++++++++ 7 files changed, 523 insertions(+), 9 deletions(-) create mode 100644 src/theory/strings/word.cpp create mode 100644 src/theory/strings/word.h create mode 100644 test/unit/theory/theory_strings_word_white.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45912a30d..9a81ccc63 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -702,6 +702,8 @@ libcvc4_add_sources( theory/strings/theory_strings_utils.cpp theory/strings/theory_strings_utils.h theory/strings/type_enumerator.h + theory/strings/word.cpp + theory/strings/word.h theory/subs_minimize.cpp theory/subs_minimize.h theory/substitutions.cpp diff --git a/src/theory/strings/word.cpp b/src/theory/strings/word.cpp new file mode 100644 index 000000000..655df7da3 --- /dev/null +++ b/src/theory/strings/word.cpp @@ -0,0 +1,227 @@ +/********************* */ +/*! \file word.cpp + ** \verbatim + ** Top contributors (to current version): + ** Andrew Reynolds + ** This file is part of the CVC4 project. + ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS + ** in the top-level source directory) and their institutional affiliations. + ** All rights reserved. See the file COPYING in the top-level source + ** directory for licensing information.\endverbatim + ** + ** \brief Implementation of utility functions for words. + **/ + +#include "theory/strings/word.h" + +#include "util/regexp.h" + +using namespace CVC4::kind; + +namespace CVC4 { +namespace theory { +namespace strings { + +Node Word::mkEmptyWord(Kind k) +{ + NodeManager* nm = NodeManager::currentNM(); + if (k == CONST_STRING) + { + std::vector vec; + return nm->mkConst(String(vec)); + } + Unimplemented(); + return Node::null(); +} + +Node Word::mkWord(const std::vector& xs) +{ + Assert(!xs.empty()); + NodeManager* nm = NodeManager::currentNM(); + Kind k = xs[0].getKind(); + if (k == CONST_STRING) + { + std::vector vec; + for (TNode x : xs) + { + Assert(x.getKind() == CONST_STRING); + String sx = x.getConst(); + const std::vector& vecc = sx.getVec(); + vec.insert(vec.end(), vecc.begin(), vecc.end()); + } + return nm->mkConst(String(vec)); + } + Unimplemented(); + return Node::null(); +} + +size_t Word::getLength(TNode x) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + return x.getConst().size(); + } + Unimplemented(); + return 0; +} + +bool Word::isEmpty(TNode x) { return getLength(x) == 0; } + +std::size_t Word::find(TNode x, TNode y, std::size_t start) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.find(sy, start); + } + Unimplemented(); + return 0; +} + +std::size_t Word::rfind(TNode x, TNode y, std::size_t start) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.rfind(sy, start); + } + Unimplemented(); + return 0; +} + +bool Word::hasPrefix(TNode x, TNode y) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.hasPrefix(sy); + } + Unimplemented(); + return false; +} + +bool Word::hasSuffix(TNode x, TNode y) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.hasSuffix(sy); + } + Unimplemented(); + return false; +} + +Node Word::replace(TNode x, TNode y, TNode t) +{ + NodeManager* nm = NodeManager::currentNM(); + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + Assert(t.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + String st = t.getConst(); + return nm->mkConst(String(sx.replace(sy, st))); + } + Unimplemented(); + return Node::null(); +} +Node Word::substr(TNode x, std::size_t i) +{ + NodeManager* nm = NodeManager::currentNM(); + Kind k = x.getKind(); + if (k == CONST_STRING) + { + String sx = x.getConst(); + return nm->mkConst(String(sx.substr(i))); + } + Unimplemented(); + return Node::null(); +} +Node Word::substr(TNode x, std::size_t i, std::size_t j) +{ + NodeManager* nm = NodeManager::currentNM(); + Kind k = x.getKind(); + if (k == CONST_STRING) + { + String sx = x.getConst(); + return nm->mkConst(String(sx.substr(i, j))); + } + Unimplemented(); + return Node::null(); +} + +Node Word::prefix(TNode x, std::size_t i) { return substr(x, 0, i); } + +Node Word::suffix(TNode x, std::size_t i) +{ + NodeManager* nm = NodeManager::currentNM(); + Kind k = x.getKind(); + if (k == CONST_STRING) + { + String sx = x.getConst(); + return nm->mkConst(String(sx.suffix(i))); + } + Unimplemented(); + return Node::null(); +} + +bool Word::noOverlapWith(TNode x, TNode y) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.noOverlapWith(sy); + } + Unimplemented(); + return false; +} + +std::size_t Word::overlap(TNode x, TNode y) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.overlap(sy); + } + Unimplemented(); + return 0; +} + +std::size_t Word::roverlap(TNode x, TNode y) +{ + Kind k = x.getKind(); + if (k == CONST_STRING) + { + Assert(y.getKind() == CONST_STRING); + String sx = x.getConst(); + String sy = y.getConst(); + return sx.roverlap(sy); + } + Unimplemented(); + return 0; +} + +} // namespace strings +} // namespace theory +} // namespace CVC4 diff --git a/src/theory/strings/word.h b/src/theory/strings/word.h new file mode 100644 index 000000000..e4d10247a --- /dev/null +++ b/src/theory/strings/word.h @@ -0,0 +1,121 @@ +/********************* */ +/*! \file word.h + ** \verbatim + ** Top contributors (to current version): + ** Andrew Reynolds + ** This file is part of the CVC4 project. + ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS + ** in the top-level source directory) and their institutional affiliations. + ** All rights reserved. See the file COPYING in the top-level source + ** directory for licensing information.\endverbatim + ** + ** \brief Utility functions for words. + **/ + +#include "cvc4_private.h" + +#ifndef CVC4__THEORY__STRINGS__WORD_H +#define CVC4__THEORY__STRINGS__WORD_H + +#include + +#include "expr/node.h" + +namespace CVC4 { +namespace theory { +namespace strings { + +// ------------------------------ for words (string or sequence constants) +class Word +{ +public: + /** make empty constant of kind k */ + static Node mkEmptyWord(Kind k); + + /** make word from constants in (non-empty) vector vec */ + static Node mkWord(const std::vector& xs); + + /** Return the length of word x */ + static size_t getLength(TNode x); + + /** Return true if x is empty */ + static bool isEmpty(TNode x); + + /** Return the first position y occurs in x, or std::string::npos otherwise */ + static std::size_t find(TNode x, TNode y, std::size_t start = 0); + + /** + * Return the first position y occurs in x searching from the end of x, or + * std::string::npos otherwise + */ + static std::size_t rfind(TNode x, TNode y, std::size_t start = 0); + + /** Returns true if y is a prefix of x */ + static bool hasPrefix(TNode x, TNode y); + + /** Returns true if y is a suffix of x */ + static bool hasSuffix(TNode x, TNode y); + + /** Replace the first occurrence of y in x with t */ + static Node replace(TNode x, TNode y, TNode t); + + /** Return the substring/subsequence of x starting at index i */ + static Node substr(TNode x, std::size_t i); + + /** Return the substring/subsequence of x starting at index i with size at most + * j */ + static Node substr(TNode x, std::size_t i, std::size_t j); + + /** Return the prefix of x of size at most i */ + static Node prefix(TNode x, std::size_t i); + + /** Return the suffix of x of size at most i */ + static Node suffix(TNode x, std::size_t i); + + /** + * Checks if there is any overlap between word x and another word y. This + * corresponds to checking whether one string contains the other and whether a + * substring/subsequence of one is a prefix of the other and vice-versa. + * + * @param x The first string + * @param y The second string + * @return True if there is an overlap, false otherwise + */ + static bool noOverlapWith(TNode x, TNode y); + + /** overlap + * + * if overlap returns m>0, + * then the maximal suffix of this string that is a prefix of y is of length m. + * + * For example, if x is "abcdef", then: + * x.overlap("defg") = 3 + * x.overlap("ab") = 0 + * x.overlap("d") = 0 + * x.overlap("bcdefdef") = 5 + */ + static std::size_t overlap(TNode x, TNode y); + + /** reverse overlap + * + * if roverlap returns m>0, + * then the maximal prefix of this word that is a suffix of y is of length m. + * + * For example, if x is "abcdef", then: + * x.roverlap("aaabc") = 3 + * x.roverlap("def") = 0 + * x.roverlap("d") = 0 + * x.roverlap("defabcde") = 5 + * + * Notice that x.overlap(y) = y.roverlap(x) + */ + static std::size_t roverlap(TNode x, TNode y); +}; + +// ------------------------------ end for words (string or sequence constants) + +} // namespace strings +} // namespace theory +} // namespace CVC4 + +#endif diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp index ee63308ab..4dfe68b51 100644 --- a/src/util/regexp.cpp +++ b/src/util/regexp.cpp @@ -462,6 +462,13 @@ String String::substr(std::size_t i, std::size_t j) const { return String(ret_vec); } +bool String::noOverlapWith(const String& y) const +{ + return y.find(*this) == std::string::npos + && this->find(y) == std::string::npos && this->overlap(y) == 0 + && y.overlap(*this) == 0; +} + bool String::isNumber() const { if (d_str.empty()) { return false; diff --git a/src/util/regexp.h b/src/util/regexp.h index 5b6dc2b62..1cde53687 100644 --- a/src/util/regexp.h +++ b/src/util/regexp.h @@ -140,34 +140,41 @@ class CVC4_PUBLIC String { bool isRepeated() const; bool tailcmp(const String& y, int& c) const; + /** + * Return the first position y occurs in this string, or std::string::npos + * otherwise. + */ std::size_t find(const String& y, const std::size_t start = 0) const; + /** + * Return the first position y occurs in this string searching from the end, + * or std::string::npos otherwise. + */ std::size_t rfind(const String& y, const std::size_t start = 0) const; /** Returns true if y is a prefix of this */ bool hasPrefix(const String& y) const; /** Returns true if y is a suffix of this */ bool hasSuffix(const String& y) const; - + /** Replace the first occurrence of s in this string with t */ String replace(const String& s, const String& t) const; + /** Return the substring of this string starting at index i */ String substr(std::size_t i) const; + /** Return the substring of this string starting at index i with size at most + * j */ String substr(std::size_t i, std::size_t j) const; - + /** Return the prefix of this string of size at most i */ String prefix(std::size_t i) const { return substr(0, i); } + /** Return the suffix of this string of size at most i */ String suffix(std::size_t i) const { return substr(size() - i, i); } /** * Checks if there is any overlap between this string and another string. This - * corresponds to checking whether one string contains the other and wether a + * corresponds to checking whether one string contains the other and whether a * substring of one is a prefix of the other and vice-versa. * * @param y The other string * @return True if there is an overlap, false otherwise */ - bool noOverlapWith(const String& y) const - { - return y.find(*this) == std::string::npos - && this->find(y) == std::string::npos && this->overlap(y) == 0 - && y.overlap(*this) == 0; - } + bool noOverlapWith(const String& y) const; /** string overlap * diff --git a/test/unit/theory/CMakeLists.txt b/test/unit/theory/CMakeLists.txt index 02ca5d8b5..f20088ad1 100644 --- a/test/unit/theory/CMakeLists.txt +++ b/test/unit/theory/CMakeLists.txt @@ -10,5 +10,6 @@ cvc4_add_unit_test_white(theory_quantifiers_bv_instantiator_white theory) cvc4_add_unit_test_white(theory_quantifiers_bv_inverter_white theory) cvc4_add_unit_test_white(theory_strings_rewriter_white theory) cvc4_add_unit_test_white(theory_strings_skolem_cache_black theory) +cvc4_add_unit_test_white(theory_strings_word_white theory) cvc4_add_unit_test_white(theory_white theory) cvc4_add_unit_test_white(type_enumerator_white theory) diff --git a/test/unit/theory/theory_strings_word_white.h b/test/unit/theory/theory_strings_word_white.h new file mode 100644 index 000000000..d84df7836 --- /dev/null +++ b/test/unit/theory/theory_strings_word_white.h @@ -0,0 +1,149 @@ +/********************* */ +/*! \file theory_strings_word_white.h + ** \verbatim + ** Top contributors (to current version): + ** Andrew Reynolds + ** This file is part of the CVC4 project. + ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS + ** in the top-level source directory) and their institutional affiliations. + ** All rights reserved. See the file COPYING in the top-level source + ** directory for licensing information.\endverbatim + ** + ** \brief Unit tests for the strings word utilities + **/ + +#include "expr/node.h" +#include "expr/node_manager.h" +#include "smt/smt_engine.h" +#include "smt/smt_engine_scope.h" +#include "theory/strings/word.h" + +#include +#include +#include +#include + +using namespace CVC4; +using namespace CVC4::smt; +using namespace CVC4::theory; +using namespace CVC4::theory::strings; + +class TheoryStringsWordWhite : public CxxTest::TestSuite +{ + public: + TheoryStringsWordWhite() {} + + void setUp() override + { + Options opts; + opts.setOutputLanguage(language::output::LANG_SMTLIB_V2); + d_em = new ExprManager(opts); + d_smt = new SmtEngine(d_em); + d_scope = new SmtScope(d_smt); + + d_nm = NodeManager::currentNM(); + } + + void tearDown() override + { + delete d_scope; + delete d_smt; + delete d_em; + } + + void testStrings() + { + Node empty = d_nm->mkConst(String("")); + Node a = d_nm->mkConst(String("a")); + Node b = d_nm->mkConst(String("b")); + Node aa = d_nm->mkConst(String("aa")); + Node aaaaa = d_nm->mkConst(String("aaaaa")); + Node abc = d_nm->mkConst(String("abc")); + Node bbc = d_nm->mkConst(String("bbc")); + Node cac = d_nm->mkConst(String("cac")); + Node abca = d_nm->mkConst(String("abca")); + + TS_ASSERT(Word::mkEmptyWord(kind::CONST_STRING) == empty); + + std::vector vec; + vec.push_back(abc); + Node abcMk = Word::mkWord(vec); + TS_ASSERT_EQUALS(abc, abcMk); + vec.push_back(a); + Node abcaMk = Word::mkWord(vec); + TS_ASSERT_EQUALS(abca, abcaMk); + + TS_ASSERT(Word::getLength(empty) == 0); + TS_ASSERT(Word::getLength(aaaaa) == 5); + + TS_ASSERT(Word::isEmpty(empty)); + TS_ASSERT(!Word::isEmpty(a)); + + TS_ASSERT(Word::find(empty, empty) == 0); + TS_ASSERT(Word::find(a, empty) == 0); + TS_ASSERT(Word::find(empty, empty, 1) == std::string::npos); + TS_ASSERT(Word::find(cac, a, 0) == 1); + TS_ASSERT(Word::find(cac, abc) == std::string::npos); + + TS_ASSERT(Word::rfind(aaaaa, empty) == 0); + TS_ASSERT(Word::rfind(aaaaa, a) == 0); + TS_ASSERT(Word::rfind(abca, abc, 1) == 1); + + TS_ASSERT(Word::hasPrefix(empty, empty)); + TS_ASSERT(Word::hasPrefix(a, empty)); + TS_ASSERT(Word::hasPrefix(aaaaa, a)); + TS_ASSERT(!Word::hasPrefix(abca, b)); + TS_ASSERT(!Word::hasPrefix(empty, a)); + + TS_ASSERT(Word::hasSuffix(empty, empty)); + TS_ASSERT(Word::hasSuffix(a, empty)); + TS_ASSERT(Word::hasSuffix(a, a)); + TS_ASSERT(!Word::hasSuffix(abca, b)); + TS_ASSERT(!Word::hasSuffix(empty, abc)); + + TS_ASSERT_EQUALS(bbc, Word::replace(abc, a, b)); + TS_ASSERT_EQUALS(aaaaa, Word::replace(aaaaa, b, a)); + TS_ASSERT_EQUALS(aa, Word::replace(a, empty, a)); + + TS_ASSERT_EQUALS(empty, Word::substr(a, 1)); + TS_ASSERT_EQUALS(empty, Word::substr(empty, 0)); + TS_ASSERT_EQUALS(a, Word::substr(abca, 3)); + + TS_ASSERT_EQUALS(a, Word::substr(abc, 0, 1)); + TS_ASSERT_EQUALS(aa, Word::substr(aaaaa, 3, 2)); + + TS_ASSERT_EQUALS(a, Word::prefix(a, 1)); + TS_ASSERT_EQUALS(empty, Word::prefix(empty, 0)); + TS_ASSERT_EQUALS(a, Word::prefix(aaaaa, 1)); + + TS_ASSERT_EQUALS(a, Word::suffix(a, 1)); + TS_ASSERT_EQUALS(empty, Word::suffix(empty, 0)); + TS_ASSERT_EQUALS(aa, Word::suffix(aaaaa, 2)); + + TS_ASSERT(!Word::noOverlapWith(abc, empty)); + TS_ASSERT(Word::noOverlapWith(cac, aa)); + TS_ASSERT(!Word::noOverlapWith(cac, abc)); + TS_ASSERT(Word::noOverlapWith(cac, b)); + TS_ASSERT(!Word::noOverlapWith(cac, a)); + TS_ASSERT(!Word::noOverlapWith(abca, a)); + + TS_ASSERT(Word::overlap(abc, empty) == 0); + TS_ASSERT(Word::overlap(aaaaa, abc) == 1); + TS_ASSERT(Word::overlap(cac, abc) == 0); + TS_ASSERT(Word::overlap(empty, abc) == 0); + TS_ASSERT(Word::overlap(aaaaa, aa) == 2); + + TS_ASSERT(Word::roverlap(abc, empty) == 0); + TS_ASSERT(Word::roverlap(aaaaa, abc) == 0); + TS_ASSERT(Word::roverlap(cac, abc) == 1); + TS_ASSERT(Word::roverlap(empty, abc) == 0); + TS_ASSERT(Word::roverlap(aaaaa, aa) == 2); + } + + private: + ExprManager* d_em; + SmtEngine* d_smt; + SmtScope* d_scope; + + NodeManager* d_nm; +}; -- 2.30.2