From 7cec5cb5aba03324f1781a72457a2e1202d557ff Mon Sep 17 00:00:00 2001 From: Gereon Kremer Date: Tue, 2 Mar 2021 23:49:29 +0100 Subject: [PATCH] Improve handling of utf8 encoded inputs (#5694) This PR fixes an issue where utf8 encoded inputs are incorrectly parsed into CVC4::String. We now use std::mbtowc to first turn the char sequence from the std::string input into a std::wstring and then process this std::wstring one charactor (wchar_t) at a time. Fixes #5673 --- src/parser/parser.cpp | 55 ++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/src/parser/parser.cpp b/src/parser/parser.cpp index ab0571e53..f1c917363 100644 --- a/src/parser/parser.cpp +++ b/src/parser/parser.cpp @@ -17,6 +17,7 @@ #include "parser/parser.h" #include +#include #include #include #include @@ -737,21 +738,41 @@ SymbolManager* Parser::getSymbolManager() { return d_symman; } std::vector Parser::processAdHocStringEsc(const std::string& s) { + std::wstring ws; + { + std::setlocale(LC_ALL, "en_US.utf8"); + std::mbtowc(nullptr, nullptr, 0); + const char* end = s.data() + s.size(); + const char* ptr = s.data(); + for (wchar_t c; ptr != end; ) { + int res = std::mbtowc(&c, ptr, end - ptr); + if (res == -1) { + std::cerr << "Invalid escape sequence in " << s << std::endl; + break; + } else if (res == 0) { + break; + } else { + ws += c; + ptr += res; + } + } + } + std::vector str; unsigned i = 0; - while (i < s.size()) + while (i < ws.size()) { // get the current character - if (s[i] != '\\') + if (ws[i] != '\\') { // don't worry about printable here - str.push_back(static_cast(s[i])); + str.push_back(static_cast(ws[i])); ++i; continue; } // slash is always escaped ++i; - if (i >= s.size()) + if (i >= ws.size()) { // slash cannot be the last character if we are parsing escape sequences std::stringstream serr; @@ -759,7 +780,7 @@ std::vector Parser::processAdHocStringEsc(const std::string& s) << "\" should be handled by lexer"; parseError(serr.str()); } - switch (s[i]) + switch (ws[i]) { case 'n': { @@ -812,12 +833,12 @@ std::vector Parser::processAdHocStringEsc(const std::string& s) case 'x': { bool isValid = false; - if (i + 2 < s.size()) + if (i + 2 < ws.size()) { - if (std::isxdigit(s[i + 1]) && std::isxdigit(s[i + 2])) + if (std::isxdigit(ws[i + 1]) && std::isxdigit(ws[i + 2])) { - std::stringstream shex; - shex << s[i + 1] << s[i + 2]; + std::wstringstream shex; + shex << ws[i + 1] << ws[i + 2]; unsigned val; shex >> std::hex >> val; str.push_back(val); @@ -836,19 +857,19 @@ std::vector Parser::processAdHocStringEsc(const std::string& s) break; default: { - if (std::isdigit(s[i])) + if (std::isdigit(ws[i])) { // octal escape sequences TODO : revisit (issue #1251). - unsigned num = static_cast(s[i]) - 48; + unsigned num = static_cast(ws[i]) - 48; bool flag = num < 4; - if (i + 1 < s.size() && num < 8 && std::isdigit(s[i + 1]) - && s[i + 1] < '8') + if (i + 1 < ws.size() && num < 8 && std::isdigit(ws[i + 1]) + && ws[i + 1] < '8') { - num = num * 8 + static_cast(s[i + 1]) - 48; - if (flag && i + 2 < s.size() && std::isdigit(s[i + 2]) - && s[i + 2] < '8') + num = num * 8 + static_cast(ws[i + 1]) - 48; + if (flag && i + 2 < ws.size() && std::isdigit(ws[i + 2]) + && ws[i + 2] < '8') { - num = num * 8 + static_cast(s[i + 2]) - 48; + num = num * 8 + static_cast(ws[i + 2]) - 48; str.push_back(num); i += 3; } -- 2.30.2