/********************* */
-/** antlr_input.cpp
- ** Original author: cconway
- ** Major contributors: none
- ** Minor contributors (to current version): none
- ** This file is part of the CVC4 prototype.
- ** Copyright (c) 2009, 2010 The Analysis of Computer Systems Group (ACSys)
- ** Courant Institute of Mathematical Sciences
- ** New York University
+/*! \file antlr_input.cpp
+ ** \verbatim
+ ** Original author: Christopher L. Conway
+ ** Major contributors: Morgan Deters, Kshitij Bansal
+ ** Minor contributors (to current version): Francois Bobot
+ ** This file is part of the CVC4 project.
+ ** Copyright (c) 2009-2014 New York University and The University of Iowa
** See the file COPYING in the top-level source directory for licensing
- ** information.
+ ** information.\endverbatim
+ **
+ ** \brief A super-class for ANTLR-generated input language parsers.
**
** A super-class for ANTLR-generated input language parsers
**/
+#include "parser/antlr_input.h"
+// We rely on the inclusion of #include <antlr3.h> in "parser/antlr_input.h".
+// This is avoid having to undefine the symbols in <antlr3.h>.
+// See the documentation in "parser/antlr_undefines.h" for more
+// details.
+
#include <limits.h>
-#include <antlr3.h>
-
-#include "antlr_input.h"
-#include "bounded_token_buffer.h"
-#include "bounded_token_factory.h"
-#include "memory_mapped_input_buffer.h"
-#include "parser_exception.h"
-#include "parser_state.h"
-
-#include "util/output.h"
-#include "util/Assert.h"
-#include "expr/command.h"
+#include <stdint.h>
+
+#include "base/output.h"
#include "expr/type.h"
+#include "parser/antlr_line_buffered_input.h"
+#include "parser/bounded_token_buffer.h"
+#include "parser/bounded_token_factory.h"
+#include "parser/cvc/cvc_input.h"
+#include "parser/input.h"
+#include "parser/memory_mapped_input_buffer.h"
+#include "parser/parser.h"
+#include "parser/parser_exception.h"
+#include "parser/smt1/smt1_input.h"
+#include "parser/smt2/smt2_input.h"
+#include "parser/smt2/sygus_input.h"
+#include "parser/tptp/tptp_input.h"
+#include "smt_util/command.h"
using namespace std;
using namespace CVC4;
namespace CVC4 {
namespace parser {
-AntlrInput::AntlrInput(ExprManager* exprManager, const std::string& filename, unsigned int lookahead, bool useMmap) :
- Input(exprManager, filename),
- d_lookahead(lookahead),
- d_lexer(NULL),
- d_parser(NULL),
- d_tokenStream(NULL) {
+// These functions exactly wrap the antlr3 source inconsistencies.
+// These are the only location CVC4_ANTLR3_OLD_INPUT_STREAM ifdefs appear.
+// No other sanity checking happens;
+pANTLR3_INPUT_STREAM newAntlr3BufferedStream(std::istream& input, const std::string& name);
+pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name);
+pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
+ uint32_t size,
+ const std::string& name);
+
+pANTLR3_INPUT_STREAM
+newAntlr3BufferedStream(std::istream& input, const std::string& name){
+ pANTLR3_INPUT_STREAM inputStream = NULL;
+ pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
+
+#ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
+ inputStream =
+ antlr3LineBufferedStreamNew(input, 0, name_duplicate);
+#else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
+ inputStream =
+ antlr3LineBufferedStreamNew(input, ANTLR3_ENC_8BIT, name_duplicate);
+#endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
+
+ free(name_duplicate);
+ return inputStream;
+}
+
+pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name){
+ pANTLR3_INPUT_STREAM input = NULL;
+ pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
+
+ // libantlr3c v3.2 isn't source-compatible with v3.4
+#ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
+ input = antlr3AsciiFileStreamNew(name_duplicate);
+#else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
+ input = antlr3FileStreamNew(name_duplicate, ANTLR3_ENC_8BIT);
+#endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
+
+ free(name_duplicate);
+ return input;
+}
+
+
+pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
+ uint32_t size,
+ const std::string& name){
+ pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
+ pANTLR3_INPUT_STREAM inputStream = NULL;
+ /* Create an ANTLR input backed by the buffer. */
+#ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
+ inputStream =
+ antlr3NewAsciiStringInPlaceStream(basep, size, name_duplicate);
+#else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
+ inputStream =
+ antlr3StringStreamNew(basep, ANTLR3_ENC_8BIT, size,
+ name_duplicate);
+#endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
+ free(name_duplicate);
+ return inputStream;
+}
+
+AntlrInputStream::AntlrInputStream(std::string name,
+ pANTLR3_INPUT_STREAM input,
+ bool fileIsTemporary,
+ pANTLR3_UINT8 inputString) :
+ InputStream(name, fileIsTemporary),
+ d_input(input),
+ d_inputString(inputString)
+{
+ assert( input != NULL );
+ input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
+}
+
+AntlrInputStream::~AntlrInputStream() {
+ d_input->free(d_input);
+ if(d_inputString != NULL){
+ free(d_inputString);
+ }
+}
+
+pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
+ return d_input;
+}
+
+
+
+AntlrInputStream*
+AntlrInputStream::newFileInputStream(const std::string& name,
+ bool useMmap)
+ throw (InputStreamException) {
+#ifdef _WIN32
+ if(useMmap) {
+ useMmap = false;
+ }
+#endif
+ pANTLR3_INPUT_STREAM input = NULL;
+ if(useMmap) {
+ input = MemoryMappedInputBufferNew(name);
+ } else {
+ input = newAntlr3FileStream(name);
+ }
+ if(input == NULL) {
+ throw InputStreamException("Couldn't open file: " + name);
+ }
+ return new AntlrInputStream( name, input, false, NULL );
+}
+
+
+AntlrInputStream*
+AntlrInputStream::newStreamInputStream(std::istream& input,
+ const std::string& name,
+ bool lineBuffered)
+ throw (InputStreamException) {
- if( useMmap ) {
- d_input = MemoryMappedInputBufferNew(filename);
+ pANTLR3_INPUT_STREAM inputStream = NULL;
+ pANTLR3_UINT8 inputStringCopy = NULL;
+
+ if(lineBuffered) {
+ inputStream = newAntlr3BufferedStream(input, name);
} else {
- d_input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) filename.c_str());
+
+ // Since these are all NULL on entry, realloc will be called
+ char *basep = NULL, *boundp = NULL, *cp = NULL;
+ /* 64KB seems like a reasonable default size. */
+ size_t bufSize = 0x10000;
+
+ /* Keep going until we can't go no more. */
+ while( !input.eof() && !input.fail() ) {
+
+ if( cp == boundp ) {
+ /* We ran out of room in the buffer. Realloc at double the size. */
+ ptrdiff_t offset = cp - basep;
+ basep = (char *) realloc(basep, bufSize);
+ if( basep == NULL ) {
+ throw InputStreamException("Failed buffering input stream: " + name);
+ }
+ cp = basep + offset;
+ boundp = basep + bufSize;
+ bufSize *= 2;
+ }
+
+ /* Read as much as we have room for. */
+ input.read( cp, boundp - cp );
+ cp += input.gcount();
+ }
+
+ /* Make sure the fail bit didn't get set. */
+ if( !input.eof() ) {
+ throw InputStreamException("Stream input failed: " + name);
+ }
+ ptrdiff_t offset = cp - basep;
+ assert(offset >= 0);
+ assert(offset <= std::numeric_limits<uint32_t>::max());
+ inputStringCopy = (pANTLR3_UINT8)basep;
+ inputStream = newAntrl3InPlaceStream(inputStringCopy, (uint32_t) offset, name);
+ }
+
+ if( inputStream == NULL ) {
+ throw InputStreamException("Couldn't initialize input: " + name);
+ }
+
+ return new AntlrInputStream( name, inputStream, false, inputStringCopy );
+}
+
+
+AntlrInputStream*
+AntlrInputStream::newStringInputStream(const std::string& input,
+ const std::string& name)
+ throw (InputStreamException) {
+
+ size_t input_size = input.size();
+ assert(input_size <= std::numeric_limits<uint32_t>::max());
+
+ // Ownership of input_duplicate is transferred to the AntlrInputStream.
+ pANTLR3_UINT8 input_duplicate = (pANTLR3_UINT8) strdup(input.c_str());
+
+ if( input_duplicate == NULL ) {
+ throw InputStreamException("Couldn't initialize string input: '" + input + "'");
+ }
+
+ pANTLR3_INPUT_STREAM inputStream = newAntrl3InPlaceStream(input_duplicate, (uint32_t)input_size, name);
+ if( inputStream==NULL ) {
+ throw InputStreamException("Couldn't initialize string input: '" + input + "'");
+ }
+ return new AntlrInputStream( name, inputStream, false, input_duplicate );
+}
+
+AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
+ using namespace language::input;
+
+ AntlrInput* input;
+
+ switch(lang) {
+ case LANG_CVC4: {
+ input = new CvcInput(inputStream);
+ break;
}
- if( d_input == NULL ) {
- throw ParserException("Couldn't open file: " + filename);
+ case LANG_SMTLIB_V1:
+ input = new Smt1Input(inputStream);
+ break;
+
+ case LANG_SMTLIB_V2_0:
+ case LANG_SMTLIB_V2_5:
+ input = new Smt2Input(inputStream, lang);
+ break;
+
+ case LANG_SYGUS:
+ input = new SygusInput(inputStream);
+ break;
+
+ case LANG_TPTP:
+ input = new TptpInput(inputStream);
+ break;
+
+ default:
+ std::stringstream ss;
+ ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
+ throw InputStreamException(ss.str());
}
+
+ return input;
+}
+
+AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
+ Input(inputStream),
+ d_lookahead(lookahead),
+ d_lexer(NULL),
+ d_parser(NULL),
+ d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
+ d_tokenBuffer(NULL) {
}
/*
}
*/
-AntlrInput::AntlrInput(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
+/*
+AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
Input(exprManager,name),
d_lookahead(lookahead),
d_lexer(NULL),
d_parser(NULL),
d_tokenStream(NULL) {
+
char* inputStr = strdup(input.c_str());
char* nameStr = strdup(name.c_str());
if( inputStr==NULL || nameStr==NULL ) {
throw ParserException("Couldn't initialize string input: '" + input + "'");
}
- d_input = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
- if( d_input == NULL ) {
+ d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
+ if( d_inputStream == NULL ) {
throw ParserException("Couldn't create input stream for string: '" + input + "'");
}
+
}
+*/
AntlrInput::~AntlrInput() {
- d_tokenStream->free(d_tokenStream);
- d_input->close(d_input);
+ BoundedTokenBufferFree(d_tokenBuffer);
}
-pANTLR3_INPUT_STREAM AntlrInput::getInputStream() {
- return d_input;
+pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
+ return d_tokenBuffer->commonTstream;
}
-pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
- return d_tokenStream;
+void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
+ pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
+ assert(lexer!=NULL);
+ Parser *parser = (Parser*)(lexer->super);
+ assert(parser!=NULL);
+ AntlrInput *input = (AntlrInput*) parser->getInput();
+ assert(input!=NULL);
+
+ /* Call the error display routine *if* there's not already a
+ * parse error pending. If a parser error is pending, this
+ * error is probably less important, so we just drop it. */
+ if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
+ input->parseError("Error finding next token.");
+ }
+}
+
+void AntlrInput::warning(const std::string& message) {
+ Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
+}
+
+
+
+/**
+ * characters considered part of a simple symbol in SMTLIB.
+ *
+ * TODO: Ideally this code shouldn't be smtlib specific (should work
+ * with CVC language too), but this per-language specialization has
+ * been left for a later point.
+ */
+inline bool isSimpleChar(char ch) {
+ return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
+}
+
+size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
+ size_t st = 0;
+ size_t N = input.size();
+ while(st < N) {
+ while( st < N && (*isWordChar)(input[st]) == false ) st++;
+ size_t en = st;
+ while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
+ if(en - st + 1 == pattern.size()) {
+ bool match = true;
+ for(size_t i = 0; match && i < pattern.size(); ++i) {
+ match &= (pattern[i] == input[st+i]);
+ }
+ if(match == true) {
+ return st;
+ }
+ }
+ st = en + 1;
+ }
+ return string::npos;
+}
+
+/**
+ * Gets part of original input and tries to visually hint where the
+ * error might be.
+ *
+ * Something like:
+ *
+ * ...nd (= alpha beta) (= beta delta))
+ * ^
+ *
+ * Implementation (as on 2014/04/24):
+ *
+ * > if suggested pointer by lexer is under a "simple char", move to
+ * start of the word and print pointer there.
+ *
+ * > in the other case, it tries to find the nearest word in the error
+ * message passed along. if it can't find it, we don't add this
+ * visual hint, as experimentally position suggested by lexer was
+ * found to be totally unhelpful. (TODO: fix this upstream to
+ * improve)
+ */
+std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
+{
+ // Is it a multi-line message
+ bool multilineMessage = (message.find('\n') != string::npos);
+ // Useful only if it is a multi-line message
+ int firstLineEnd = message.find('\n');
+
+ std::ostringstream ss, slicess;
+
+ // Keep first line of message
+ if(multilineMessage) {
+ ss << message.substr(0, firstLineEnd) << endl << endl;
+ } else {
+ ss << message << endl << endl;
+ }
+
+ int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
+ int posSliceEnd = posSliceStart + 70;
+ int caretPos = 0;
+ int caretPosExtra = 0; // for inital intendation, epilipses etc.
+
+ ss << " "; caretPosExtra += 2;
+ if(posSliceStart > 0) {
+ ss << "..."; caretPosExtra += 3;
+ }
+
+ for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
+ if(i == posSliceEnd) {
+ ss << "...";
+ break;
+ }
+ if(i < charPositionInLine) { caretPos++; }
+
+ if(!isprint(lineStart[i])) {
+ // non-printable character, something wrong, bail out
+ return message;
+ }
+
+ ss << (lineStart[i]);
+ slicess << (lineStart[i]);
+ }
+
+ // adjust position of caret, based on slice and message
+ {
+ int caretPosOrig = caretPos;
+ string slice = slicess.str();
+ if(isSimpleChar(slice[caretPos])) {
+ // if alphanumeric, try to go to beginning of word/number
+ while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
+ if(caretPos == 0 && posSliceStart > 0) {
+ // reached start and this is not really the start? bail out
+ return message;
+ } else {
+ // likely it is also in original message? if so, very likely
+ // we found the right place
+ string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
+ size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
+ Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
+ if( matchLoc != string::npos ) {
+ Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
+ }
+ }
+ } else {
+ bool foundCaretPos = false;
+
+ for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
+ // go to nearest alphanumeric string (before current position),
+ // see if that word can be found in original message. If so,
+ // point to that, else keep pointer where it was.
+ int nearestWordEn = caretPos - 1;
+ while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
+ --nearestWordEn;
+ }
+ if(isSimpleChar(slice[nearestWordEn])) {
+ int nearestWordSt = nearestWordEn;
+ while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
+ --nearestWordSt;
+ }
+ string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
+ size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
+ Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
+ Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
+ if( matchLoc != string::npos ) {
+ Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
+ << nearestWordSt << std::endl;
+ foundCaretPos = true;
+ }
+ caretPos = nearestWordSt;
+ }
+ }
+ if( !foundCaretPos) {
+ // this doesn't look good. caret generally getting printed
+ // at unhelpful positions. improve upstream?
+ return message;
+ }
+ }
+ caretPos += caretPosExtra;
+ }// end of caret position computation/heuristics
+
+ ss << std::endl;
+ while( caretPos-- > 0 ) {
+ ss << ' ';
+ }
+ ss << '^' << endl;
+ if(multilineMessage) {
+ ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
+ }
+ return ss.str();
}
-void AntlrInput::parseError(const std::string& message)
- throw (ParserException) {
+void AntlrInput::parseError(const std::string& message, bool eofException)
+ throw (ParserException) {
+
+ string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
+ d_lexer->getCharPositionInLine(d_lexer),
+ message);
+
Debug("parser") << "Throwing exception: "
- << getParserState()->getFilename() << ":"
+ << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
<< d_lexer->getLine(d_lexer) << "."
<< d_lexer->getCharPositionInLine(d_lexer) << ": "
- << message << endl;
- throw ParserException(message, getParserState()->getFilename(),
- d_lexer->getLine(d_lexer),
- d_lexer->getCharPositionInLine(d_lexer));
+ << updatedMessage << endl;
+ if(eofException) {
+ throw ParserEndOfFileException(message,
+ (const char*)d_lexer->rec->state->tokSource->fileName->chars,
+ d_lexer->getLine(d_lexer),
+ d_lexer->getCharPositionInLine(d_lexer));
+ } else {
+ throw ParserException(updatedMessage,
+ (const char*)d_lexer->rec->state->tokSource->fileName->chars,
+ d_lexer->getLine(d_lexer),
+ d_lexer->getCharPositionInLine(d_lexer));
+ }
}
-void AntlrInput::setLexer(pANTLR3_LEXER pLexer) {
+void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
d_lexer = pLexer;
pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
}
/* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
- pTokenFactory = BoundedTokenFactoryNew(d_input, 2*d_lookahead);
+ pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
if( pTokenFactory == NULL ) {
- throw ParserException("Couldn't create token factory.");
+ throw InputStreamException("Couldn't create token factory.");
}
d_lexer->rec->state->tokFactory = pTokenFactory;
pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
if( buffer == NULL ) {
- throw ParserException("Couldn't create token buffer.");
+ throw InputStreamException("Couldn't create token buffer.");
}
- d_tokenStream = buffer->commonTstream;
+ d_tokenBuffer = buffer;
+
+ // Override default lexer error reporting
+ d_lexer->rec->reportError = &lexerError;
+ // Override default nextToken function, just to prevent exceptions escaping.
+ d_lexer->rec->state->tokSource->nextToken = &nextToken;
}
-void AntlrInput::setParser(pANTLR3_PARSER pParser) {
+void AntlrInput::setParser(Parser& parser) {
+ // ANTLR isn't using super in the lexer or the parser, AFAICT.
+ // We could also use @lexer/parser::context to add a field to the generated
+ // objects, but then it would have to be declared separately in every
+ // language's grammar and we'd have to in the address of the field anyway.
+ d_lexer->super = &parser;
+ d_parser->super = &parser;
+}
+
+void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
d_parser = pParser;
- // ANTLR isn't using super, AFAICT.
- // We could also use @parser::context to add a field to the generated parser, but then
- // it would have to be declared separately in every input's grammar and we'd have to
- // pass it in as an address anyway.
- d_parser->super = getParserState();
// d_parser->rec->match = &match;
d_parser->rec->reportError = &reportError;
/* Don't try to recover from a parse error. */
d_parser->rec->mismatch;
}
-
}/* CVC4::parser namespace */
}/* CVC4 namespace */