From: Andres Noetzli Date: Tue, 27 Nov 2018 16:10:36 +0000 (-0800) Subject: Reduce lookahead when parsing string literals (#2721) X-Git-Tag: cvc5-1.0.0~4352 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=99a1da848889776586436f7f9aec9a1b088703c1;p=cvc5.git Reduce lookahead when parsing string literals (#2721) --- diff --git a/src/parser/smt2/Smt2.g b/src/parser/smt2/Smt2.g index d9b0f622b..7143824d6 100644 --- a/src/parser/smt2/Smt2.g +++ b/src/parser/smt2/Smt2.g @@ -2611,65 +2611,50 @@ termList[std::vector& formulas, CVC4::Expr& expr] * Matches a string, and strips off the quotes. */ str[std::string& s, bool fsmtlib] - : STRING_LITERAL_2_0 - { s = AntlrInput::tokenText($STRING_LITERAL_2_0); + : STRING_LITERAL + { + s = AntlrInput::tokenText($STRING_LITERAL); /* strip off the quotes */ s = s.substr(1, s.size() - 2); - for(size_t i=0; i 127 && !isprint(s[i])) { - PARSER_STATE->parseError("Extended/unprintable characters are not " - "part of SMT-LIB, and they must be encoded " - "as escape sequences"); + for (size_t i = 0; i < s.size(); i++) + { + if ((unsigned)s[i] > 127 && !isprint(s[i])) + { + PARSER_STATE->parseError( + "Extended/unprintable characters are not " + "part of SMT-LIB, and they must be encoded " + "as escape sequences"); } } - if(fsmtlib) { - /* handle SMT-LIB standard escapes '\\' and '\"' */ + if (fsmtlib || PARSER_STATE->escapeDupDblQuote()) + { char* p_orig = strdup(s.c_str()); char *p = p_orig, *q = p_orig; - while(*q != '\0') { - if(*q == '\\') { + while (*q != '\0') + { + if (PARSER_STATE->escapeDupDblQuote() && *q == '"') + { + // Handle SMT-LIB >=2.5 standard escape '""'. + ++q; + assert(*q == '"'); + } + else if (!PARSER_STATE->escapeDupDblQuote() && *q == '\\') + { ++q; - if(*q == '\\' || *q == '"') { - *p++ = *q++; - } else { + // Handle SMT-LIB 2.0 standard escapes '\\' and '\"'. + if (*q != '\\' && *q != '"') + { assert(*q != '\0'); *p++ = '\\'; - *p++ = *q++; } - } else { - *p++ = *q++; } + *p++ = *q++; } *p = '\0'; s = p_orig; free(p_orig); } } - | STRING_LITERAL_2_5 - { s = AntlrInput::tokenText($STRING_LITERAL_2_5); - /* strip off the quotes */ - s = s.substr(1, s.size() - 2); - for(size_t i=0; i 127 && !isprint(s[i])) { - PARSER_STATE->parseError("Extended/unprintable characters are not " - "part of SMT-LIB, and they must be encoded " - "as escape sequences"); - } - } - // In the 2.5 version, always handle escapes (regardless of fsmtlib flag). - char* p_orig = strdup(s.c_str()); - char *p = p_orig, *q = p_orig; - while(*q != '\0') { - if(*q == '"') { - ++q; - assert(*q == '"'); - } - *p++ = *q++; - } - *p = '\0'; - s = p_orig; - free(p_orig); - } ; /** @@ -3276,37 +3261,25 @@ BINARY_LITERAL ; /** - * Matches a double-quoted string literal from SMT-LIB 2.0. - * Escaping is supported, and * escape character '\' has to be escaped. + * Matches a double-quoted string literal. Depending on the language that is + * being parsed, different escape sequences are supported: * - * You shouldn't generally use this in parser rules, as the quotes - * will be part of the token text. Use the str[] parser rule instead. - */ -STRING_LITERAL_2_0 - : { PARSER_STATE->v2_0() }?=> - '"' ('\\' . | ~('\\' | '"'))* '"' - ; - -/** - * Matches a double-quoted string literal from SMT-LIB 2.5. - * You escape a double-quote inside the string with "", e.g., - * "This is a string literal with "" a single, embedded double quote." + * For SMT-LIB 2.0 the sequence \" is interpreted as a double quote (") and the + * sequence \\ is interpreted as a backslash (\). + * + * For SMT-LIB >=2.5 and SyGuS a double-quote inside a string is escaped with + * "", e.g., "This is a string literal with "" a single, embedded double + * quote." * * You shouldn't generally use this in parser rules, as the quotes * will be part of the token text. Use the str[] parser rule instead. */ -STRING_LITERAL_2_5 - : { PARSER_STATE->v2_5() || PARSER_STATE->sygus() }?=> +STRING_LITERAL + : { !PARSER_STATE->escapeDupDblQuote() }?=> + '"' ('\\' . | ~('\\' | '"'))* '"' + | { PARSER_STATE->escapeDupDblQuote() }?=> '"' (~('"') | '""')* '"' ; - -/** - * Matches sygus quoted literal - */ -SYGUS_QUOTED_LITERAL - : { PARSER_STATE->sygus() }?=> - '"' (ALPHA|DIGIT)* '"' - ; /** * Matches the comments and ignores them diff --git a/src/parser/smt2/smt2.h b/src/parser/smt2/smt2.h index 2d57332af..7a3dbb9db 100644 --- a/src/parser/smt2/smt2.h +++ b/src/parser/smt2/smt2.h @@ -183,6 +183,13 @@ private: bool sygus() const { return getLanguage() == language::input::LANG_SYGUS; } + /** + * Returns true if the language that we are parsing (SMT-LIB version >=2.5 + * and SyGuS) treats duplicate double quotes ("") as an escape sequence + * denoting a single double quote ("). + */ + bool escapeDupDblQuote() const { return v2_5() || sygus(); } + void setInfo(const std::string& flag, const SExpr& sexpr); void setOption(const std::string& flag, const SExpr& sexpr);