Work towards support for the strings standard.
This updates the string solver and parser such that:
The internal representation of strings is vectors of code points,
Generation of the previous internal representation of strings has been relegated to the type enumerator. This is the code that ensures that "A" is the first character chosen for values of strings in models,
The previous ad-hoc escape sequence handling is moved from the String class to the parser. It will live there for at least one version of CVC4, until we no longer support non-smt-lib complaint escape sequences or non-printable characters in strings,
Handle unicode escape sequences according to the SMT-LIB standard in String,
Simplify a number of calls to String utility functions, since the conversion between the previous internal format and code points is now unnecessary,
Fixed a bug in the handling of TO_CODE: it should be based on the alphabet cardinality, not the number of internal code points.
/* string literal */
| str[s]
- { f = SOLVER->mkString(s, true); }
+ { f = PARSER_STATE->mkStringConstant(s); }
| setsTerm[f]
;
}
}
+std::vector<unsigned> Parser::processAdHocStringEsc(const std::string& s)
+{
+ std::vector<unsigned> str;
+ unsigned i = 0;
+ while (i < s.size())
+ {
+ // get the current character
+ if (s[i] != '\\')
+ {
+ // don't worry about printable here
+ str.push_back(static_cast<unsigned>(s[i]));
+ ++i;
+ continue;
+ }
+ // slash is always escaped
+ ++i;
+ if (i >= s.size())
+ {
+ // slash cannot be the last character if we are parsing escape sequences
+ std::stringstream serr;
+ serr << "Escape sequence at the end of string: \"" << s
+ << "\" should be handled by lexer";
+ parseError(serr.str());
+ }
+ switch (s[i])
+ {
+ case 'n':
+ {
+ str.push_back(static_cast<unsigned>('\n'));
+ i++;
+ }
+ break;
+ case 't':
+ {
+ str.push_back(static_cast<unsigned>('\t'));
+ i++;
+ }
+ break;
+ case 'v':
+ {
+ str.push_back(static_cast<unsigned>('\v'));
+ i++;
+ }
+ break;
+ case 'b':
+ {
+ str.push_back(static_cast<unsigned>('\b'));
+ i++;
+ }
+ break;
+ case 'r':
+ {
+ str.push_back(static_cast<unsigned>('\r'));
+ i++;
+ }
+ break;
+ case 'f':
+ {
+ str.push_back(static_cast<unsigned>('\f'));
+ i++;
+ }
+ break;
+ case 'a':
+ {
+ str.push_back(static_cast<unsigned>('\a'));
+ i++;
+ }
+ break;
+ case '\\':
+ {
+ str.push_back(static_cast<unsigned>('\\'));
+ i++;
+ }
+ break;
+ case 'x':
+ {
+ bool isValid = false;
+ if (i + 2 < s.size())
+ {
+ if (std::isxdigit(s[i + 1]) && std::isxdigit(s[i + 2]))
+ {
+ std::stringstream shex;
+ shex << s[i + 1] << s[i + 2];
+ unsigned val;
+ shex >> std::hex >> val;
+ str.push_back(val);
+ i += 3;
+ isValid = true;
+ }
+ }
+ if (!isValid)
+ {
+ std::stringstream serr;
+ serr << "Illegal String Literal: \"" << s
+ << "\", must have two digits after \\x";
+ parseError(serr.str());
+ }
+ }
+ break;
+ default:
+ {
+ if (std::isdigit(s[i]))
+ {
+ // octal escape sequences TODO : revisit (issue #1251).
+ unsigned num = static_cast<unsigned>(s[i]) - 48;
+ bool flag = num < 4;
+ if (i + 1 < s.size() && num < 8 && std::isdigit(s[i + 1])
+ && s[i + 1] < '8')
+ {
+ num = num * 8 + static_cast<unsigned>(s[i + 1]) - 48;
+ if (flag && i + 2 < s.size() && std::isdigit(s[i + 2])
+ && s[i + 2] < '8')
+ {
+ num = num * 8 + static_cast<unsigned>(s[i + 2]) - 48;
+ str.push_back(num);
+ i += 3;
+ }
+ else
+ {
+ str.push_back(num);
+ i += 2;
+ }
+ }
+ else
+ {
+ str.push_back(num);
+ i++;
+ }
+ }
+ }
+ }
+ }
+ return str;
+}
+
+Expr Parser::mkStringConstant(const std::string& s)
+{
+ ExprManager* em = d_solver->getExprManager();
+ if (em->getOptions().getInputLanguage()
+ == language::input::LANG_SMTLIB_V2_6_1)
+ {
+ return d_solver->mkString(s, true).getExpr();
+ }
+ // otherwise, we must process ad-hoc escape sequences
+ std::vector<unsigned> str = processAdHocStringEsc(s);
+ return d_solver->mkString(str).getExpr();
+}
+
} /* CVC4::parser namespace */
} /* CVC4 namespace */
name, api::sortVectorToTypes(argTypes));
}
//------------------------ end operator overloading
+ /**
+ * Make string constant
+ *
+ * This makes the string constant based on the string s. This may involve
+ * processing ad-hoc escape sequences (if the language is not
+ * SMT-LIB 2.6.1 or higher), or otherwise calling the solver to construct
+ * the string.
+ */
+ Expr mkStringConstant(const std::string& s);
+
+ private:
+ /** ad-hoc string escaping
+ *
+ * Returns the (internal) vector of code points corresponding to processing
+ * the escape sequences in string s. This is to support string inputs that
+ * do no comply with the SMT-LIB standard.
+ *
+ * This method handles escape sequences, including \n, \t, \v, \b, \r, \f, \a,
+ * \\, \x[N] and octal escape sequences of the form \[c1]([c2]([c3])?)? where
+ * c1, c2, c3 are digits from 0 to 7.
+ */
+ std::vector<unsigned> processAdHocStringEsc(const std::string& s);
};/* class Parser */
}/* CVC4::parser namespace */
}
// String constant
- | str[s,false] { atomTerm = SOLVER->mkString(s, true); }
+ | str[s,false] { atomTerm = PARSER_STATE->mkStringConstant(s); }
// NOTE: Theory constants go here
std::stringstream ssv;
if (varCounter < 26)
{
- ssv << String::convertUnsignedIntToChar(varCounter + 32);
+ ssv << static_cast<char>(varCounter + 61);
}
else
{
toStreamRational(out, n, false);
break;
}
+ case kind::CONST_STRING:
+ {
+ out << '"' << n.getConst<String>().toString() << '"';
+ break;
+ }
case kind::TYPE_CONSTANT:
switch(TypeConstant tc = n.getConst<TypeConstant>()) {
case REAL_TYPE:
}
case kind::CONST_STRING: {
- std::string s = n.getConst<String>().toString(true);
+ std::string s = n.getConst<String>().toString();
out << '"';
for(size_t i = 0; i < s.size(); ++i) {
char c = s[i];
const String& s = results[currNode[0]].d_str;
if (s.size() == 1)
{
- results[currNode] = EvalResult(
- Rational(String::convertUnsignedIntToCode(s.getVec()[0])));
+ results[currNode] = EvalResult(Rational(s.getVec()[0]));
}
else
{
for (unsigned ch : alphas)
{
d_rstring_alphabet.push_back(ch);
- Trace("sygus-sample-str-alpha")
- << " \"" << String::convertUnsignedIntToChar(ch) << "\"";
+ Trace("sygus-sample-str-alpha") << " \\u" << ch;
}
Trace("sygus-sample-str-alpha") << std::endl;
}
}
case kind::REGEXP_RANGE: {
unsigned a = r[0].getConst<String>().front();
- a = String::convertUnsignedIntToCode(a);
unsigned b = r[1].getConst<String>().front();
- b = String::convertUnsignedIntToCode(b);
Assert(a < b);
Assert(b < std::numeric_limits<unsigned>::max());
for (unsigned c = a; c <= b; c++)
String s = st.getConst<String>();
if(s.size() != 0) {
unsigned sc = s.front();
- sc = String::convertUnsignedIntToCode(sc);
cset.insert(sc);
}
}
if(st[0].isConst()) {
String s = st[0].getConst<String>();
unsigned sc = s.front();
- sc = String::convertUnsignedIntToCode(sc);
cset.insert(sc);
} else {
vset.insert( st[0] );
case kind::REGEXP_RANGE: {
std::vector< Node > vec;
unsigned a = r[0].getConst<String>().front();
- a = String::convertUnsignedIntToCode(a);
unsigned b = r[1].getConst<String>().front();
- b = String::convertUnsignedIntToCode(b);
for (unsigned c = a; c <= b; c++)
{
std::vector<unsigned> tmpVec;
- tmpVec.push_back(String::convertCodeToUnsignedInt(c));
+ tmpVec.push_back(c);
Node tmp = s.eqNode(nm->mkConst(String(tmpVec))).negate();
vec.push_back( tmp );
}
++it)
{
std::vector<unsigned> cvec;
- cvec.push_back(String::convertCodeToUnsignedInt(*it));
+ cvec.push_back(*it);
String c(cvec);
Trace("regexp-int-debug") << "Try character " << c << " ... " << std::endl;
Node r1l = derivativeSingle(r1, c);
if (s.size() == index_start + 1)
{
unsigned a = r[0].getConst<String>().front();
- a = String::convertUnsignedIntToCode(a);
unsigned b = r[1].getConst<String>().front();
- b = String::convertUnsignedIntToCode(b);
unsigned c = s.back();
- c = String::convertUnsignedIntToCode(c);
return (a <= c && c <= b);
}
else
std::vector<unsigned> nvec = node[0].getConst<String>().getVec();
for (unsigned i = 0, nvsize = nvec.size(); i < nvsize; i++)
{
- unsigned newChar = String::convertUnsignedIntToCode(nvec[i]);
+ unsigned newChar = nvec[i];
// transform it
// upper 65 ... 90
// lower 97 ... 122
newChar = newChar + 32;
}
}
- newChar = String::convertCodeToUnsignedInt(newChar);
nvec[i] = newChar;
}
Node retNode = nm->mkConst(String(nvec));
{
std::vector<unsigned> vec = s.getVec();
Assert(vec.size() == 1);
- ret = nm->mkConst(Rational(String::convertUnsignedIntToCode(vec[0])));
+ ret = nm->mkConst(Rational(vec[0]));
}
else
{
ctv.getConst<Rational>().getNumerator().toUnsignedInt();
Trace("strings-model") << "(code: " << cvalue << ") ";
std::vector<unsigned> vec;
- vec.push_back(String::convertCodeToUnsignedInt(cvalue));
+ vec.push_back(cvalue);
Node mv = nm->mkConst(String(vec));
pure_eq_assign[eqc] = mv;
m->getEqualityEngine()->addTerm(mv);
else if (n.getKind() == STRING_TO_CODE)
{
d_has_str_code = true;
- // ite( str.len(s)==1, 0 <= str.code(s) < num_codes, str.code(s)=-1 )
+ // ite( str.len(s)==1, 0 <= str.code(s) < |A|, str.code(s)=-1 )
Node code_len = utils::mkNLength(n[0]).eqNode(d_one);
Node code_eq_neg1 = n.eqNode(d_neg_one);
Node code_range = nm->mkNode(
AND,
nm->mkNode(GEQ, n, d_zero),
- nm->mkNode(LT, n, nm->mkConst(Rational(CVC4::String::num_codes()))));
+ nm->mkNode(
+ LT, n, nm->mkConst(Rational(utils::getAlphabetCardinality()))));
regTermLem = nm->mkNode(ITE, code_len, code_range, code_eq_neg1);
}
else if (n.getKind() == STRING_STRIDOF)
throw TypeCheckingExceptionPrivate(n, "expecting a single constant string term in regexp range");
}
unsigned ci = (*it).getConst<String>().front();
- ch[i] = String::convertUnsignedIntToCode(ci);
+ ch[i] = ci;
++it;
}
if(ch[0] > ch[1]) {
namespace theory {
namespace strings {
+Node makeStandardModelConstant(const std::vector<unsigned>& vec,
+ uint32_t cardinality)
+{
+ std::vector<unsigned> mvec;
+ // if we contain all of the printable characters
+ if (cardinality >= 255)
+ {
+ for (unsigned i = 0, vsize = vec.size(); i < vsize; i++)
+ {
+ unsigned curr = vec[i];
+ // convert
+ Assert(vec[i] < cardinality);
+ if (vec[i] <= 61)
+ {
+ // first 62 printable characters [\u{65}-\u{126}]: 'A', 'B', 'C', ...
+ curr = vec[i] + 65;
+ }
+ else if (vec[i] <= 94)
+ {
+ // remaining 33 printable characters [\u{32}-\u{64}]: ' ', '!', '"', ...
+ curr = vec[i] - 30;
+ }
+ else
+ {
+ // the remaining characters, starting with \u{127} and wrapping around
+ // the first 32 non-printable characters.
+ curr = (vec[i] + 32) % cardinality;
+ }
+ mvec.push_back(curr);
+ }
+ }
+ else
+ {
+ mvec = vec;
+ }
+ return NodeManager::currentNM()->mkConst(String(mvec));
+}
+
WordIter::WordIter(uint32_t startLength) : d_hasEndLength(false), d_endLength(0)
{
for (uint32_t i = 0; i < startLength; i++)
void StringEnumLen::mkCurr()
{
- d_curr = NodeManager::currentNM()->mkConst(String(d_witer->getData()));
+ d_curr = makeStandardModelConstant(d_witer->getData(), d_cardinality);
}
StringEnumerator::StringEnumerator(TypeNode type, TypeEnumeratorProperties* tep)
namespace theory {
namespace strings {
+/**
+ * Make standard model constant
+ *
+ * In our string representation, we represent characters using vectors
+ * of unsigned integers indicating code points for the characters of that
+ * string.
+ *
+ * To make models user-friendly, we make unsigned integer 0 correspond to the
+ * 65th character ("A") in the ASCII alphabet to make models intuitive. In
+ * particular, say if we have a set of string variables that are distinct but
+ * otherwise unconstrained, then the model may assign them "A", "B", "C", ...
+ *
+ * @param vec The code points of the string in a given model,
+ * @param cardinality The cardinality of the alphabet,
+ * @return A string whose characters have the code points corresponding
+ * to vec in the standard model construction described above.
+ */
+Node makeStandardModelConstant(const std::vector<unsigned>& vec,
+ uint32_t cardinality);
+
/**
* Generic iteration over vectors of indices of a given start/end length.
*/
static_assert(UCHAR_MAX == 255, "Unsigned char is assumed to have 256 values.");
-unsigned String::convertCharToUnsignedInt(unsigned char c)
-{
- return convertCodeToUnsignedInt(static_cast<unsigned>(c));
-}
-unsigned char String::convertUnsignedIntToChar(unsigned i)
-{
- Assert(i < num_codes());
- return static_cast<unsigned char>(convertUnsignedIntToCode(i));
-}
-bool String::isPrintable(unsigned i)
-{
- Assert(i < num_codes());
- unsigned char c = convertUnsignedIntToChar(i);
- return (c >= ' ' && c <= '~');
-}
-unsigned String::convertCodeToUnsignedInt(unsigned c)
-{
- Assert(c < num_codes());
- return (c < start_code() ? c + num_codes() : c) - start_code();
-}
-unsigned String::convertUnsignedIntToCode(unsigned i)
-{
- Assert(i < num_codes());
- return (i + start_code()) % num_codes();
-}
-
String::String(const std::vector<unsigned> &s) : d_str(s)
{
#ifdef CVC4_ASSERTIONS
for (unsigned u : d_str)
{
- Assert(convertUnsignedIntToCode(u) < num_codes());
+ Assert(u < num_codes());
}
#endif
}
}
for (unsigned int i = 0; i < size(); ++i) {
if (d_str[i] != y.d_str[i]) {
- unsigned cp = convertUnsignedIntToCode(d_str[i]);
- unsigned cpy = convertUnsignedIntToCode(y.d_str[i]);
+ unsigned cp = d_str[i];
+ unsigned cpy = y.d_str[i];
return cp < cpy ? -1 : 1;
}
}
return true;
}
-std::vector<unsigned> String::toInternal(const std::string &s,
- bool useEscSequences) {
+void String::addCharToInternal(unsigned char ch, std::vector<unsigned>& str)
+{
+ // if not a printable character
+ if (ch > 127 || ch < 32)
+ {
+ std::stringstream serr;
+ serr << "Illegal string character: \"" << ch
+ << "\", must use escape sequence";
+ throw CVC4::Exception(serr.str());
+ }
+ else
+ {
+ str.push_back(static_cast<unsigned>(ch));
+ }
+}
+
+std::vector<unsigned> String::toInternal(const std::string& s,
+ bool useEscSequences)
+{
std::vector<unsigned> str;
unsigned i = 0;
- while (i < s.size()) {
- if (s[i] == '\\' && useEscSequences) {
- i++;
- if (i < s.size()) {
- switch (s[i]) {
- case 'n': {
- str.push_back(convertCharToUnsignedInt('\n'));
- i++;
- } break;
- case 't': {
- str.push_back(convertCharToUnsignedInt('\t'));
- i++;
- } break;
- case 'v': {
- str.push_back(convertCharToUnsignedInt('\v'));
- i++;
- } break;
- case 'b': {
- str.push_back(convertCharToUnsignedInt('\b'));
- i++;
- } break;
- case 'r': {
- str.push_back(convertCharToUnsignedInt('\r'));
- i++;
- } break;
- case 'f': {
- str.push_back(convertCharToUnsignedInt('\f'));
- i++;
- } break;
- case 'a': {
- str.push_back(convertCharToUnsignedInt('\a'));
- i++;
- } break;
- case '\\': {
- str.push_back(convertCharToUnsignedInt('\\'));
- i++;
- } break;
- case 'x': {
- if (i + 2 < s.size()) {
- if (isxdigit(s[i + 1]) && isxdigit(s[i + 2])) {
- str.push_back(convertCharToUnsignedInt(hexToDec(s[i + 1]) * 16 +
- hexToDec(s[i + 2])));
- i += 3;
- } else {
- throw CVC4::Exception("Illegal String Literal: \"" + s + "\"");
- }
- } else {
- throw CVC4::Exception("Illegal String Literal: \"" + s +
- "\", must have two digits after \\x");
- }
- } break;
- default: {
- if (isdigit(s[i])) {
- // octal escape sequences TODO : revisit (issue #1251).
- int num = (int)s[i] - (int)'0';
- bool flag = num < 4;
- if (i + 1 < s.size() && num < 8 && isdigit(s[i + 1]) &&
- s[i + 1] < '8') {
- num = num * 8 + (int)s[i + 1] - (int)'0';
- if (flag && i + 2 < s.size() && isdigit(s[i + 2]) &&
- s[i + 2] < '8') {
- num = num * 8 + (int)s[i + 2] - (int)'0';
- str.push_back(convertCharToUnsignedInt((unsigned char)num));
- i += 3;
- } else {
- str.push_back(convertCharToUnsignedInt((unsigned char)num));
- i += 2;
- }
- } else {
- str.push_back(convertCharToUnsignedInt((unsigned char)num));
- i++;
- }
- } else if ((unsigned)s[i] > 127) {
- throw CVC4::Exception("Illegal String Literal: \"" + s +
- "\", must use escaped sequence");
- } else {
- str.push_back(convertCharToUnsignedInt(s[i]));
- i++;
- }
+ while (i < s.size())
+ {
+ // get the current character
+ char si = s[i];
+ if (si != '\\' || !useEscSequences)
+ {
+ addCharToInternal(si, str);
+ ++i;
+ continue;
+ }
+ // the vector of characters, in case we fail to read an escape sequence
+ std::vector<unsigned> nonEscCache;
+ // process the '\'
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ // are we an escape sequence?
+ bool isEscapeSequence = true;
+ // the string corresponding to the hexidecimal code point
+ std::stringstream hexString;
+ // is the slash followed by a 'u'? Could be last character.
+ if (i >= s.size() || s[i] != 'u')
+ {
+ isEscapeSequence = false;
+ }
+ else
+ {
+ // process the 'u'
+ addCharToInternal(s[i], nonEscCache);
+ ++i;
+ bool isStart = true;
+ bool isEnd = false;
+ bool hasBrace = false;
+ while (i < s.size())
+ {
+ // add the next character
+ si = s[i];
+ if (isStart)
+ {
+ isStart = false;
+ // possibly read '{'
+ if (si == '{')
+ {
+ hasBrace = true;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ continue;
}
}
- } else {
- throw CVC4::Exception("should be handled by lexer: \"" + s + "\"");
- // str.push_back( convertCharToUnsignedInt('\\') );
+ else if (si == '}')
+ {
+ // can only end if we had an open brace and read at least one digit
+ isEscapeSequence = hasBrace && !hexString.str().empty();
+ isEnd = true;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ break;
+ }
+ // must be a hex digit at this point
+ if (!isHexDigit(static_cast<unsigned>(si)))
+ {
+ isEscapeSequence = false;
+ break;
+ }
+ hexString << si;
+ addCharToInternal(si, nonEscCache);
+ ++i;
+ if (!hasBrace && hexString.str().size() == 4)
+ {
+ // will be finished reading \ u d_3 d_2 d_1 d_0 with no parens
+ isEnd = true;
+ break;
+ }
+ else if (hasBrace && hexString.str().size() > 5)
+ {
+ // too many digits enclosed in brace, not an escape sequence
+ isEscapeSequence = false;
+ break;
+ }
+ }
+ if (!isEnd)
+ {
+ // if we were interupted before ending, then this is not a valid
+ // escape sequence
+ isEscapeSequence = false;
+ }
+ }
+ if (isEscapeSequence)
+ {
+ Assert(!hexString.str().empty() && hexString.str().size() <= 5);
+ // Otherwise, we add the escaped character.
+ // This is guaranteed not to overflow due to the length of hstr.
+ uint32_t val;
+ hexString >> std::hex >> val;
+ if (val > num_codes())
+ {
+ // Failed due to being out of range. This can happen for strings of
+ // the form \ u { d_4 d_3 d_2 d_1 d_0 } where d_4 is a hexidecimal not
+ // in the range [0-2].
+ isEscapeSequence = false;
+ }
+ else
+ {
+ str.push_back(val);
}
- } else if ((unsigned)s[i] > 127 && useEscSequences) {
- throw CVC4::Exception("Illegal String Literal: \"" + s +
- "\", must use escaped sequence");
- } else {
- str.push_back(convertCharToUnsignedInt(s[i]));
- i++;
+ }
+ // if we did not successfully parse an escape sequence, we add back all
+ // characters that we cached
+ if (!isEscapeSequence)
+ {
+ str.insert(str.end(), nonEscCache.begin(), nonEscCache.end());
}
}
#ifdef CVC4_ASSERTIONS
for (unsigned u : str)
{
- Assert(convertUnsignedIntToCode(u) < num_codes());
+ Assert(u < num_codes());
}
#endif
return str;
}
std::string String::toString(bool useEscSequences) const {
- std::string str;
+ std::stringstream str;
for (unsigned int i = 0; i < size(); ++i) {
- unsigned char c = convertUnsignedIntToChar(d_str[i]);
- if (!useEscSequences) {
- str += c;
- } else if (isprint(c)) {
- if (c == '\\') {
- str += "\\\\";
- }
- // else if(c == '\"') {
- // str += "\\\"";
- //}
- else {
- str += c;
- }
- } else {
- std::string s;
- switch (c) {
- case '\a':
- s = "\\a";
- break;
- case '\b':
- s = "\\b";
- break;
- case '\t':
- s = "\\t";
- break;
- case '\r':
- s = "\\r";
- break;
- case '\v':
- s = "\\v";
- break;
- case '\f':
- s = "\\f";
- break;
- case '\n':
- s = "\\n";
- break;
- case '\e':
- s = "\\e";
- break;
- default: {
- std::stringstream ss;
- ss << std::setfill('0') << std::setw(2) << std::hex << ((int)c);
- std::string t = ss.str();
- t = t.substr(t.size() - 2, 2);
- s = "\\x" + t;
- // std::string s2 = static_cast<std::ostringstream*>(
- // &(std::ostringstream() << (int)c) )->str();
- }
- }
- str += s;
+ // we always print forward slash as a code point so that it cannot
+ // be interpreted as specifying part of a code point, e.g. the string
+ // '\' + 'u' + '0' of length three.
+ if (isPrintable(d_str[i]) && d_str[i] != '\\' && !useEscSequences)
+ {
+ str << static_cast<char>(d_str[i]);
+ }
+ else
+ {
+ std::stringstream ss;
+ ss << std::hex << d_str[i];
+ str << "\\u{" << ss.str() << "}";
}
}
- return str;
+ return str.str();
}
bool String::isLeq(const String &y) const
{
return false;
}
- unsigned ci = convertUnsignedIntToCode(d_str[i]);
- unsigned cyi = convertUnsignedIntToCode(y.d_str[i]);
+ unsigned ci = d_str[i];
+ unsigned cyi = y.d_str[i];
if (ci > cyi)
{
return false;
bool String::isDigit(unsigned character)
{
- unsigned char c = convertUnsignedIntToChar(character);
- return c >= '0' && c <= '9';
+ // '0' to '9'
+ return 48 <= character && character <= 57;
+}
+
+bool String::isHexDigit(unsigned character)
+{
+ // '0' to '9' or 'A' to 'F' or 'a' to 'f'
+ return isDigit(character) || (65 <= character && character <= 70)
+ || (97 <= character && character <= 102);
+}
+
+bool String::isPrintable(unsigned character)
+{
+ // Unicode 0x00020 (' ') to 0x0007E ('~')
+ return 32 <= character && character <= 126;
}
size_t String::maxSize() { return std::numeric_limits<uint32_t>::max(); }
return Rational(toString());
}
-unsigned char String::hexToDec(unsigned char c) {
- if (c >= '0' && c <= '9') {
- return c - '0';
- } else if (c >= 'a' && c <= 'f') {
- return c - 'a' + 10;
- } else {
- Assert(c >= 'A' && c <= 'F');
- return c - 'A' + 10;
- }
-}
-
std::ostream &operator<<(std::ostream &os, const String &s) {
return os << "\"" << s.toString(true) << "\"";
}
*/
class CVC4_PUBLIC String {
public:
- /**
- * The start ASCII code. In our string representation below, we represent
- * characters using a vector d_str of unsigned integers. We refer to this as
- * the "internal representation" for the string.
- *
- * We make unsigned integer 0 correspond to the 65th character ("A") in the
- * ASCII alphabet to make models intuitive. In particular, say if we have
- * a set of string variables that are distinct but otherwise unconstrained,
- * then the model may assign them "A", "B", "C", ...
- */
- static inline unsigned start_code() { return 65; }
/**
* This is the cardinality of the alphabet that is representable by this
* class. Notice that this must be greater than or equal to the cardinality
* of the alphabet that the string theory reasons about.
*
* This must be strictly less than std::numeric_limits<unsigned>::max().
+ *
+ * As per the SMT-LIB standard for strings, we support the first 3 planes of
+ * Unicode characters, where 196608 = 3*16^4.
*/
- static inline unsigned num_codes() { return 256; }
- /**
- * Convert unsigned char to the unsigned used in the internal representation
- * in d_str below.
- */
- static unsigned convertCharToUnsignedInt(unsigned char c);
- /** Convert the internal unsigned to a unsigned char. */
- static unsigned char convertUnsignedIntToChar(unsigned i);
- /** Does the internal unsigned correspond to a printable character? */
- static bool isPrintable(unsigned i);
- /** get the internal unsigned for ASCII code c. */
- static unsigned convertCodeToUnsignedInt(unsigned c);
- /** get the ASCII code number that internal unsigned i corresponds to. */
- static unsigned convertUnsignedIntToCode(unsigned i);
-
+ static inline unsigned num_codes() { return 196608; }
/** constructors for String
- *
- * Internally, a CVC4::String is represented by a vector of unsigned
- * integers (d_str), where the correspondence between C++ characters
- * to and from unsigned integers is determined by
- * by convertCharToUnsignedInt and convertUnsignedIntToChar.
- *
- * If useEscSequences is true, then the escape sequences in the input
- * are converted to the corresponding character. This constructor may
- * throw an exception if the input contains unrecognized escape sequences.
- * Currently supported escape sequences are \n, \t, \v, \b, \r, \f, \a, \\,
- * \x[N] where N is a hexidecimal, and octal escape sequences of the
- * form \[c1]([c2]([c3])?)? where c1, c2, c3 are digits from 0 to 7.
- *
- * If useEscSequences is false, then the characters of the constructed
- * CVC4::String correspond one-to-one with the input string.
- */
+ *
+ * Internally, a CVC4::String is represented by a vector of unsigned
+ * integers (d_str) representing the code points of the characters.
+ *
+ * To build a string from a C++ string, we may process escape sequences
+ * according to the SMT-LIB standard. In particular, if useEscSequences is
+ * true, we convert unicode escape sequences:
+ * \u d_3 d_2 d_1 d_0
+ * \u{d_0}
+ * \u{d_1 d_0}
+ * \u{d_2 d_1 d_0}
+ * \u{d_3 d_2 d_1 d_0}
+ * \u{d_4 d_3 d_2 d_1 d_0}
+ * where d_0 ... d_4 are hexidecimal digits, to the appropriate character.
+ *
+ * If useEscSequences is false, then the characters of the constructed
+ * CVC4::String correspond one-to-one with the input string.
+ */
String() = default;
explicit String(const std::string& s, bool useEscSequences = false)
- : d_str(toInternal(s, useEscSequences)) {}
+ : d_str(toInternal(s, useEscSequences))
+ {
+ }
explicit String(const char* s, bool useEscSequences = false)
- : d_str(toInternal(std::string(s), useEscSequences)) {}
+ : d_str(toInternal(std::string(s), useEscSequences))
+ {
+ }
explicit String(const std::vector<unsigned>& s);
String& operator=(const String& y) {
bool rstrncmp(const String& y, std::size_t n) const;
/* toString
- * Converts this string to a std::string.
- *
- * If useEscSequences is true, then unprintable characters
- * are converted to escape sequences. The escape sequences
- * \n, \t, \v, \b, \r, \f, \a, \\ are printed in this way.
- * For all other unprintable characters, we print \x[N] where
- * [N] is the 2 digit hexidecimal corresponding to value of
- * the character.
- *
- * If useEscSequences is false, the returned std::string's characters
- * map one-to-one with the characters in this string.
- * Notice that for all std::string s, we have that
- * CVC4::String( s ).toString() = s.
- */
+ * Converts this string to a std::string.
+ *
+ * The unprintable characters are converted to unicode escape sequences as
+ * described above.
+ *
+ * If useEscSequences is false, the string's printable characters are
+ * printed as characters. Notice that for all std::string s having only
+ * printable characters, we have that
+ * CVC4::String( s ).toString() = s.
+ */
std::string toString(bool useEscSequences = false) const;
/** is this the empty string? */
bool empty() const { return d_str.empty(); }
bool isNumber() const;
/** Returns the corresponding rational for the text of this string. */
Rational toNumber() const;
- /** get the internal unsigned representation of this string */
+ /** Get the unsigned representation (code points) of this string */
const std::vector<unsigned>& getVec() const { return d_str; }
- /** get the internal unsigned value of the first character in this string */
+ /**
+ * Get the unsigned (code point) value of the first character in this string
+ */
unsigned front() const;
- /** get the internal unsigned value of the last character in this string */
+ /**
+ * Get the unsigned (code point) value of the last character in this string
+ */
unsigned back() const;
/** is the unsigned a digit?
- * The input should be the same type as the element type of d_str
- */
+ *
+ * This is true for code points between 48 ('0') and 57 ('9').
+ */
static bool isDigit(unsigned character);
+ /** is the unsigned a hexidecimal digit?
+ *
+ * This is true for code points between 48 ('0') and 57 ('9'), code points
+ * between 65 ('A') and 70 ('F) and code points between 97 ('a') and 102 ('f).
+ */
+ static bool isHexDigit(unsigned character);
+ /** is the unsigned a printable code point?
+ *
+ * This is true for Unicode 32 (' ') to 126 ('~').
+ */
+ static bool isPrintable(unsigned character);
/**
* Returns the maximum length of string representable by this class.
*/
static size_t maxSize();
private:
- // guarded
- static unsigned char hexToDec(unsigned char c);
-
+ /**
+ * Helper for toInternal: add character ch to vector vec, storing a string in
+ * internal format. This throws an error if ch is not a printable character,
+ * since non-printable characters must be escaped in SMT-LIB.
+ */
+ static void addCharToInternal(unsigned char ch, std::vector<unsigned>& vec);
+ /**
+ * Convert the string s to the internal format (vector of code points).
+ * The argument useEscSequences is whether to process unicode escape
+ * sequences.
+ */
static std::vector<unsigned> toInternal(const std::string& s,
- bool useEscSequences = true);
+ bool useEscSequences);
/**
* Returns a negative number if *this < y, 0 if *this and y are equal and a
regress0/strings/escchar.smt2
regress0/strings/escchar_25.smt2
regress0/strings/from_code.smt2
+ regress0/strings/gen-esc-seq.smt2
regress0/strings/hconst-092618.smt2
regress0/strings/idof-rewrites.smt2
regress0/strings/idof-sem.smt2
regress0/strings/leadingzero001.smt2
regress0/strings/loop001.smt2
regress0/strings/model001.smt2
+ regress0/strings/model-code-point.smt2
+ regress0/strings/model-friendly.smt2
regress0/strings/ncontrib-rewrites.smt2
regress0/strings/norn-31.smt2
regress0/strings/norn-simp-rew.smt2
regress0/strings/tolower-rrs.smt2
regress0/strings/tolower-simple.smt2
regress0/strings/type001.smt2
+ regress0/strings/unicode-esc.smt2
regress0/strings/unsound-0908.smt2
regress0/strings/unsound-repl-rewrite.smt2
regress0/sygus/General_plus10.sy
--- /dev/null
+; COMMAND-LINE: --produce-models --lang=smt2.6.1
+; EXPECT: sat
+; EXPECT: ((x "\u{5c}u1000"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-const x String)
+(assert (= x (str.++ "\u" "1000")))
+(check-sat)
+(get-value (x))
--- /dev/null
+; COMMAND-LINE: --lang=smt2.6.1 --produce-models
+; EXPECT: sat
+; EXPECT: ((x "\u{a}"))
+; EXPECT: ((y "\u{7f}"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-fun x () String)
+(declare-fun y () String)
+(assert (= (str.to_code x) 10))
+(assert (= (str.to_code y) 127))
+(check-sat)
+(get-value (x))
+(get-value (y))
--- /dev/null
+; COMMAND-LINE: --lang=smt2.6.1 --produce-models
+; EXPECT: sat
+; EXPECT: ((x "AAAAA"))
+(set-logic ALL)
+(set-info :status sat)
+(declare-fun x () String)
+(assert (= (str.len x) 5))
+(check-sat)
+(get-value (x))
--- /dev/null
+; COMMAND-LINE: --strings-exp --lang=smt2.6.1
+; EXPECT: sat
+(set-logic ALL)
+
+(assert (= "\u{14}" "\u0014"))
+(assert (= "\u{00}" "\u{0}"))
+(assert (= "\u0000" "\u{0}"))
+(assert (= (str.len "\u1234") 1))
+(assert (= (str.len "\u{1}") 1))
+(assert (= (str.len "\u{99}") 1))
+(assert (= (str.len "\u{779}") 1))
+(assert (= (str.len "\u{0779}") 1))
+(assert (= (str.len "\u{01779}") 1))
+(assert (= (str.len "\u{001779}") 10))
+(assert (= (str.len "\u{0vv79}") 9))
+(assert (= (str.len "\u{11\u1234}") 7))
+(assert (= (str.len "\u12345") 2))
+(assert (= (str.len "\uu") 3))
+(assert (= (str.len "\u{123}\u{567}") 2))
+(assert (= (str.len "\u{0017") 7))
+(assert (= (str.len "\\u00178") 3))
+(assert (= (str.len "2\u{}") 5))
+(assert (= (str.len "\uaaaa") 1))
+(assert (= (str.len "\uAAAA") 1))
+(assert (= (str.len "\u{0AbC}") 1))
+(assert (= (str.len "\u{E}") 1))
+(assert (= (str.len "\u{44444}") 9))
+(assert (= (str.len "\u") 2))
+
+(check-sat)
TS_ASSERT_THROWS_NOTHING(d_solver->mkString(""));
TS_ASSERT_THROWS_NOTHING(d_solver->mkString("asdfasdf"));
TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf").toString(),
- "\"asdf\\\\nasdf\"");
- TS_ASSERT_EQUALS(d_solver->mkString("asdf\\nasdf", true).toString(),
- "\"asdf\\nasdf\"");
+ "\"asdf\\u{5c}nasdf\"");
+ TS_ASSERT_EQUALS(d_solver->mkString("asdf\\u{005c}nasdf", true).toString(),
+ "\"asdf\\u{5c}nasdf\"");
}
void SolverBlack::testMkTerm()