From 6f18015fdcb824f46b969882aa45187b46306e97 Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Wed, 18 Oct 2017 12:11:50 -0500 Subject: [PATCH] Strings API escape sequences (#1245) * Argument for strings class to specify whether to process escape sequences. * Change default value on string constructor. * Make CVC4::String::toString symmetric to the constructor for CVC4::String, document. * Clang format. --- src/parser/cvc/Cvc.g | 2 +- src/parser/smt2/Smt2.g | 4 +-- src/printer/smt2/smt2_printer.cpp | 4 +-- src/util/regexp.cpp | 16 ++++++---- src/util/regexp.h | 49 ++++++++++++++++++++++++++----- 5 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/parser/cvc/Cvc.g b/src/parser/cvc/Cvc.g index eef7ca54d..6337fb5f6 100644 --- a/src/parser/cvc/Cvc.g +++ b/src/parser/cvc/Cvc.g @@ -1978,7 +1978,7 @@ stringTerm[CVC4::Expr& f] /* string literal */ | str[s] - { f = MK_CONST(CVC4::String(s)); } + { f = MK_CONST(CVC4::String(s, true)); } | setsTerm[f] ; diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g index ce1cd1fbd..05faf040e 100644 --- a/src/parser/smt2/Smt2.g +++ b/src/parser/smt2/Smt2.g @@ -1031,7 +1031,7 @@ sygusGTerm[CVC4::SygusGTerm& sgt, std::string& fun] | str[s,false] { Debug("parser-sygus") << "Sygus grammar " << fun << " : string literal \"" << s << "\"" << std::endl; - sgt.d_expr = MK_CONST( ::CVC4::String(s) ); + sgt.d_expr = MK_CONST( ::CVC4::String(s, true) ); sgt.d_name = s; sgt.d_gterm_type = SygusGTerm::gterm_op; } @@ -2328,7 +2328,7 @@ termNonVariable[CVC4::Expr& expr, CVC4::Expr& expr2] expr = MK_CONST( BitVector(binString, 2) ); } | str[s,false] - { expr = MK_CONST( ::CVC4::String(s) ); } + { expr = MK_CONST( ::CVC4::String(s, true) ); } | FP_RNE_TOK { expr = MK_CONST(roundNearestTiesToEven); } | FP_RNA_TOK { expr = MK_CONST(roundNearestTiesToAway); } | FP_RTP_TOK { expr = MK_CONST(roundTowardPositive); } diff --git a/src/printer/smt2/smt2_printer.cpp b/src/printer/smt2/smt2_printer.cpp index 0a8020651..9c9d1fb76 100644 --- a/src/printer/smt2/smt2_printer.cpp +++ b/src/printer/smt2/smt2_printer.cpp @@ -94,7 +94,7 @@ static std::string maybeQuoteSymbol(const std::string& s) { static bool stringifyRegexp(Node n, stringstream& ss) { if(n.getKind() == kind::STRING_TO_REGEXP) { - ss << n[0].getConst().toString(); + ss << n[0].getConst().toString(true); } else if(n.getKind() == kind::REGEXP_CONCAT) { for(unsigned i = 0; i < n.getNumChildren(); ++i) { if(!stringifyRegexp(n[i], ss)) { @@ -256,7 +256,7 @@ void Smt2Printer::toStream(std::ostream& out, TNode n, case kind::CONST_STRING: { //const std::vector& s = n.getConst().getVec(); - std::string s = n.getConst().toString(); + std::string s = n.getConst().toString(true); out << '"'; for(size_t i = 0; i < s.size(); ++i) { //char c = String::convertUnsignedIntToChar(s[i]); diff --git a/src/util/regexp.cpp b/src/util/regexp.cpp index ef1ab9249..681b574a3 100644 --- a/src/util/regexp.cpp +++ b/src/util/regexp.cpp @@ -84,11 +84,12 @@ bool String::rstrncmp(const String &y, const std::size_t np) const { return true; } -std::vector String::toInternal(const std::string &s) { +std::vector String::toInternal(const std::string &s, + bool useEscSequences) { std::vector str; unsigned i = 0; while (i < s.size()) { - if (s[i] == '\\') { + if (s[i] == '\\' && useEscSequences) { i++; if (i < s.size()) { switch (s[i]) { @@ -140,6 +141,7 @@ std::vector String::toInternal(const std::string &s) { } break; default: { if (isdigit(s[i])) { + // octal escape sequences TODO : revisit (issue #1251). int num = (int)s[i] - (int)'0'; bool flag = num < 4; if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) && @@ -171,7 +173,7 @@ std::vector String::toInternal(const std::string &s) { throw CVC4::Exception("should be handled by lexer: \"" + s + "\""); // str.push_back( convertCharToUnsignedInt('\\') ); } - } else if ((unsigned)s[i] > 127) { + } else if ((unsigned)s[i] > 127 && useEscSequences) { throw CVC4::Exception("Illegal String Literal: \"" + s + "\", must use escaped sequence"); } else { @@ -211,11 +213,13 @@ std::size_t String::roverlap(const String &y) const { return i; } -std::string String::toString() const { +std::string String::toString(bool useEscSequences) const { std::string str; for (unsigned int i = 0; i < size(); ++i) { unsigned char c = convertUnsignedIntToChar(d_str[i]); - if (isprint(c)) { + if (!useEscSequences) { + str += c; + } else if (isprint(c)) { if (c == '\\') { str += "\\\\"; } @@ -386,7 +390,7 @@ unsigned char String::hexToDec(unsigned char c) { } std::ostream &operator<<(std::ostream &os, const String &s) { - return os << "\"" << s.toString() << "\""; + return os << "\"" << s.toString(true) << "\""; } std::ostream &operator<<(std::ostream &out, const RegExp &s) { diff --git a/src/util/regexp.h b/src/util/regexp.h index f451a8dec..9d351dde4 100644 --- a/src/util/regexp.h +++ b/src/util/regexp.h @@ -44,9 +44,28 @@ class CVC4_PUBLIC String { return (c >= ' ' && c <= '~'); // isprint( (int)c ); } + /** constructors for String + * + * Internally, a CVC4::String is represented by a vector of unsigned + * integers (d_str), where the correspondence between C++ characters + * to and from unsigned integers is determined by + * by convertCharToUnsignedInt and convertUnsignedIntToChar. + * + * If useEscSequences is true, then the escape sequences in the input + * are converted to the corresponding character. This constructor may + * throw an exception if the input contains unrecognized escape sequences. + * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\, + * \x[N] where N is a hexidecimal, and octal escape sequences of the + * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7. + * + * If useEscSequences is false, then the characters of the constructed + * CVC4::String correspond one-to-one with the input string. + */ String() = default; - explicit String(const std::string& s) : d_str(toInternal(s)) {} - explicit String(const char* s) : d_str(toInternal(std::string(s))) {} + explicit String(const std::string& s, bool useEscSequences = false) + : d_str(toInternal(s, useEscSequences)) {} + explicit String(const char* s, bool useEscSequences = false) + : d_str(toInternal(std::string(s), useEscSequences)) {} explicit String(const unsigned char c) : d_str({convertCharToUnsignedInt(c)}) {} explicit String(const std::vector& s) : d_str(s) {} @@ -70,12 +89,27 @@ class CVC4_PUBLIC String { bool strncmp(const String& y, const std::size_t np) const; bool rstrncmp(const String& y, const std::size_t np) const; - /* - * Convenience functions - */ - std::string toString() const; + /* toString + * Converts this string to a std::string. + * + * If useEscSequences is true, then unprintable characters + * are converted to escape sequences. The escape sequences + * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way. + * For all other unprintable characters, we print \x[N] where + * [N] is the 2 digit hexidecimal corresponding to value of + * the character. + * + * If useEscSequences is false, the returned std::string's characters + * map one-to-one with the characters in this string. + * Notice that for all std::string s, we have that + * CVC4::String( s ).toString() = s. + */ + std::string toString(bool useEscSequences = false) const; + /** is this the empty string? */ bool empty() const { return d_str.empty(); } + /** is this the empty string? */ bool isEmptyString() const { return empty(); } + /** Return the length of the string */ std::size_t size() const { return d_str.size(); } unsigned char getFirstChar() const { return getUnsignedCharAt(0); } @@ -107,7 +141,8 @@ class CVC4_PUBLIC String { // guarded static unsigned char hexToDec(unsigned char c); - static std::vector toInternal(const std::string& s); + static std::vector toInternal(const std::string& s, + bool useEscSequences = true); unsigned char getUnsignedCharAt(size_t pos) const; /** -- 2.30.2