src/parser/antlr_input.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input.cpp
   3  ** \verbatim
   4  ** Original author: Christopher L. Conway
   5  ** Major contributors: Morgan Deters, Kshitij Bansal
   6  ** Minor contributors (to current version): Francois Bobot
   7  ** This file is part of the CVC4 project.
   8  ** Copyright (c) 2009-2014  New York University and The University of Iowa
   9  ** See the file COPYING in the top-level source directory for licensing
  10  ** information.\endverbatim
  11  **
  12  ** \brief A super-class for ANTLR-generated input language parsers.
  13  **
  14  ** A super-class for ANTLR-generated input language parsers
  15  **/
  16
  17 #include <limits.h>
  18 #include <antlr3.h>
  19 #include <stdint.h>
  20
  21 #include "parser/antlr_input.h"
  22 #include "parser/input.h"
  23 #include "parser/bounded_token_buffer.h"
  24 #include "parser/bounded_token_factory.h"
  25 #include "parser/antlr_line_buffered_input.h"
  26 #include "parser/memory_mapped_input_buffer.h"
  27 #include "parser/parser_exception.h"
  28 #include "parser/parser.h"
  29
  30 #include "expr/command.h"
  31 #include "expr/type.h"
  32 #include "parser/cvc/cvc_input.h"
  33 #include "parser/smt1/smt1_input.h"
  34 #include "parser/smt2/smt2_input.h"
  35 #include "parser/tptp/tptp_input.h"
  36 #include "util/output.h"
  37
  38 using namespace std;
  39 using namespace CVC4;
  40 using namespace CVC4::parser;
  41 using namespace CVC4::kind;
  42
  43 namespace CVC4 {
  44 namespace parser {
  45
  46 AntlrInputStream::AntlrInputStream(std::string name,
  47                                    pANTLR3_INPUT_STREAM input,
  48                                    bool fileIsTemporary) :
  49   InputStream(name, fileIsTemporary),
  50   d_input(input) {
  51   assert( input != NULL );
  52   input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
  53 }
  54
  55 AntlrInputStream::~AntlrInputStream() {
  56   d_input->free(d_input);
  57 }
  58
  59 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
  60   return d_input;
  61 }
  62
  63 AntlrInputStream*
  64 AntlrInputStream::newFileInputStream(const std::string& name,
  65                                      bool useMmap)
  66   throw (InputStreamException) {
  67 #ifdef _WIN32
  68   if(useMmap) {
  69     useMmap = false;
  70   }
  71 #endif
  72   pANTLR3_INPUT_STREAM input = NULL;
  73   if(useMmap) {
  74     input = MemoryMappedInputBufferNew(name);
  75   } else {
  76     // libantlr3c v3.2 isn't source-compatible with v3.4
  77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  78     input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) name.c_str());
  79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  80     input = antlr3FileStreamNew((pANTLR3_UINT8) name.c_str(), ANTLR3_ENC_8BIT);
  81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  82   }
  83   if(input == NULL) {
  84     throw InputStreamException("Couldn't open file: " + name);
  85   }
  86   return new AntlrInputStream( name, input );
  87 }
  88
  89 AntlrInputStream*
  90 AntlrInputStream::newStreamInputStream(std::istream& input,
  91                                        const std::string& name,
  92                                        bool lineBuffered)
  93   throw (InputStreamException) {
  94
  95   pANTLR3_INPUT_STREAM inputStream = NULL;
  96
  97   if(lineBuffered) {
  98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  99     inputStream =
 100       antlr3LineBufferedStreamNew(input,
 101                                   0,
 102                                   (pANTLR3_UINT8) strdup(name.c_str()));
 103 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 104     inputStream =
 105       antlr3LineBufferedStreamNew(input,
 106                                   ANTLR3_ENC_8BIT,
 107                                   (pANTLR3_UINT8) strdup(name.c_str()));
 108 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 109   } else {
 110
 111     // Since these are all NULL on entry, realloc will be called
 112     char *basep = NULL, *boundp = NULL, *cp = NULL;
 113     /* 64KB seems like a reasonable default size. */
 114     size_t bufSize = 0x10000;
 115
 116     /* Keep going until we can't go no more. */
 117     while( !input.eof() && !input.fail() ) {
 118
 119       if( cp == boundp ) {
 120         /* We ran out of room in the buffer. Realloc at double the size. */
 121         ptrdiff_t offset = cp - basep;
 122         basep = (char *) realloc(basep, bufSize);
 123         if( basep == NULL ) {
 124           throw InputStreamException("Failed buffering input stream: " + name);
 125         }
 126         cp = basep + offset;
 127         boundp = basep + bufSize;
 128         bufSize *= 2;
 129       }
 130
 131       /* Read as much as we have room for. */
 132       input.read( cp, boundp - cp );
 133       cp += input.gcount();
 134     }
 135
 136     /* Make sure the fail bit didn't get set. */
 137     if( !input.eof() ) {
 138       throw InputStreamException("Stream input failed: " + name);
 139     }
 140
 141     /* Create an ANTLR input backed by the buffer. */
 142 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
 143     inputStream =
 144       antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) basep,
 145                                         cp - basep,
 146                                         (pANTLR3_UINT8) strdup(name.c_str()));
 147 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 148     inputStream =
 149       antlr3StringStreamNew((pANTLR3_UINT8) basep,
 150                             ANTLR3_ENC_8BIT,
 151                             cp - basep,
 152                             (pANTLR3_UINT8) strdup(name.c_str()));
 153 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 154
 155   }
 156
 157   if( inputStream == NULL ) {
 158     throw InputStreamException("Couldn't initialize input: " + name);
 159   }
 160
 161   return new AntlrInputStream( name, inputStream );
 162 }
 163
 164
 165 AntlrInputStream*
 166 AntlrInputStream::newStringInputStream(const std::string& input,
 167                                        const std::string& name)
 168   throw (InputStreamException) {
 169   char* inputStr = strdup(input.c_str());
 170   char* nameStr = strdup(name.c_str());
 171   assert( inputStr!=NULL && nameStr!=NULL );
 172 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
 173   pANTLR3_INPUT_STREAM inputStream =
 174       antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) inputStr,
 175                                         input.size(),
 176                                         (pANTLR3_UINT8) nameStr);
 177 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 178   pANTLR3_INPUT_STREAM inputStream =
 179       antlr3StringStreamNew((pANTLR3_UINT8) inputStr,
 180                             ANTLR3_ENC_8BIT,
 181                             input.size(),
 182                             (pANTLR3_UINT8) nameStr);
 183 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 184   if( inputStream==NULL ) {
 185     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 186   }
 187   return new AntlrInputStream( name, inputStream );
 188 }
 189
 190 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
 191   using namespace language::input;
 192
 193   AntlrInput* input;
 194
 195   switch(lang) {
 196   case LANG_CVC4: {
 197     input = new CvcInput(inputStream);
 198     break;
 199   }
 200   case LANG_SMTLIB_V1:
 201     input = new Smt1Input(inputStream);
 202     break;
 203
 204   case LANG_SMTLIB_V2_0:
 205   case LANG_SMTLIB_V2_5:
 206     input = new Smt2Input(inputStream, lang);
 207     break;
 208
 209   case LANG_TPTP:
 210     input = new TptpInput(inputStream);
 211     break;
 212
 213   default:
 214     std::stringstream ss;
 215     ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
 216     throw InputStreamException(ss.str());
 217   }
 218
 219   return input;
 220 }
 221
 222 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
 223     Input(inputStream),
 224     d_lookahead(lookahead),
 225     d_lexer(NULL),
 226     d_parser(NULL),
 227     d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
 228     d_tokenBuffer(NULL) {
 229 }
 230
 231 /*
 232 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
 233   Parser(exprManager,name),
 234   d_lookahead(lookahead) {
 235
 236 }
 237 */
 238
 239 /*
 240 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
 241   Input(exprManager,name),
 242   d_lookahead(lookahead),
 243   d_lexer(NULL),
 244   d_parser(NULL),
 245   d_tokenStream(NULL) {
 246
 247   char* inputStr = strdup(input.c_str());
 248   char* nameStr = strdup(name.c_str());
 249   if( inputStr==NULL || nameStr==NULL ) {
 250     throw ParserException("Couldn't initialize string input: '" + input + "'");
 251   }
 252   d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
 253   if( d_inputStream == NULL ) {
 254     throw ParserException("Couldn't create input stream for string: '" + input + "'");
 255   }
 256
 257 }
 258 */
 259
 260 AntlrInput::~AntlrInput() {
 261   BoundedTokenBufferFree(d_tokenBuffer);
 262 }
 263
 264 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
 265   return d_tokenBuffer->commonTstream;
 266 }
 267
 268 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
 269   pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
 270   assert(lexer!=NULL);
 271   Parser *parser = (Parser*)(lexer->super);
 272   assert(parser!=NULL);
 273   AntlrInput *input = (AntlrInput*) parser->getInput();
 274   assert(input!=NULL);
 275
 276   /* Call the error display routine *if* there's not already a
 277    * parse error pending.  If a parser error is pending, this
 278    * error is probably less important, so we just drop it. */
 279   if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
 280     input->parseError("Error finding next token.");
 281   }
 282 }
 283
 284 void AntlrInput::warning(const std::string& message) {
 285   Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
 286 }
 287
 288
 289 /**
 290  * characters considered part of a simple symbol in SMTLIB.
 291  *
 292  * TODO: Ideally this code shouldn't be smtlib specific (should work
 293  * with CVC language too), but this per-language specialization has
 294  * been left for a later point.
 295  */
 296 inline bool isSimpleChar(char ch) {
 297   return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
 298 }
 299
 300 size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
 301   size_t st = 0;
 302   size_t N = input.size();
 303   while(st < N) {
 304     while( st < N && (*isWordChar)(input[st])  == false ) st++;
 305     size_t en = st;
 306     while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
 307     if(en - st + 1 == pattern.size()) {
 308       bool match = true;
 309       for(size_t i = 0; match && i < pattern.size(); ++i) {
 310         match &= (pattern[i] == input[st+i]);
 311       }
 312       if(match == true) {
 313         return st;
 314       }
 315     }
 316     st = en + 1;
 317   }
 318   return string::npos;
 319 }
 320
 321 /**
 322  * Gets part of original input and tries to visually hint where the
 323  * error might be.
 324  *
 325  * Something like:
 326  *
 327  *   ...nd (= alpha beta) (= beta delta))
 328  *                                ^
 329  *
 330  * Implementation (as on 2014/04/24):
 331  *
 332  * > if suggested pointer by lexer is under a "simple char", move to
 333  *   start of the word and print pointer there.
 334  *
 335  * > in the other case, it tries to find the nearest word in the error
 336  *   message passed along. if it can't find it, we don't add this
 337  *   visual hint, as experimentally position suggested by lexer was
 338  *   found to be totally unhelpful. (TODO: fix this upstream to
 339  *   improve)
 340  */
 341 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
 342 {
 343   // Is it a multi-line message
 344   bool multilineMessage = (message.find('\n') != string::npos);
 345   // Useful only if it is a multi-line message
 346   int firstLineEnd = message.find('\n');
 347
 348   std::ostringstream ss, slicess;
 349
 350   // Keep first line of message
 351   if(multilineMessage) {
 352     ss << message.substr(0, firstLineEnd) << endl << endl;
 353   } else {
 354     ss << message << endl << endl;
 355   }
 356
 357   int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
 358   int posSliceEnd = posSliceStart + 70;
 359   int caretPos = 0;
 360   int caretPosExtra = 0; // for inital intendation, epilipses etc.
 361
 362   ss << "  "; caretPosExtra += 2;
 363   if(posSliceStart > 0) {
 364     ss << "..."; caretPosExtra += 3;
 365   }
 366
 367   for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
 368     if(i == posSliceEnd) {
 369       ss << "...";
 370       break;
 371     }
 372     if(i < charPositionInLine) { caretPos++; }
 373
 374     if(!isprint(lineStart[i])) {
 375       // non-printable character, something wrong, bail out
 376       return message;
 377     }
 378
 379     ss << (lineStart[i]);
 380     slicess << (lineStart[i]);
 381   }
 382
 383   // adjust position of caret, based on slice and message
 384   {
 385     int caretPosOrig = caretPos;
 386     string slice = slicess.str();
 387     if(isSimpleChar(slice[caretPos])) {
 388       // if alphanumeric, try to go to beginning of word/number
 389       while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
 390       if(caretPos == 0 && posSliceStart > 0) {
 391         // reached start and this is not really the start? bail out
 392         return message;
 393       } else {
 394         // likely it is also in original message? if so, very likely
 395         // we found the right place
 396         string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
 397         size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 398         Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 399         if( matchLoc != string::npos ) {
 400           Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
 401         }
 402       }
 403     } else {
 404       bool foundCaretPos = false;
 405
 406       for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
 407         // go to nearest alphanumeric string (before current position),
 408         // see if that word can be found in original message. If so,
 409         // point to that, else keep pointer where it was.
 410         int nearestWordEn = caretPos - 1;
 411         while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
 412           --nearestWordEn;
 413         }
 414         if(isSimpleChar(slice[nearestWordEn])) {
 415           int nearestWordSt = nearestWordEn;
 416           while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
 417             --nearestWordSt;
 418           }
 419           string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
 420           size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 421           Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
 422           Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 423           if( matchLoc != string::npos ) {
 424             Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
 425                                     << nearestWordSt << std::endl;
 426             foundCaretPos = true;
 427           }
 428           caretPos = nearestWordSt;
 429         }
 430       }
 431       if( !foundCaretPos) {
 432         // this doesn't look good. caret generally getting printed
 433         // at unhelpful positions. improve upstream?
 434         return message;
 435       }
 436     }
 437     caretPos += caretPosExtra;
 438   }// end of caret position computation/heuristics
 439
 440   ss << std::endl;
 441   while( caretPos-- > 0 ) {
 442     ss << ' ';
 443   }
 444   ss << '^' << endl;
 445   if(multilineMessage) {
 446      ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
 447   }
 448   return ss.str();
 449 }
 450
 451 void AntlrInput::parseError(const std::string& message, bool eofException)
 452   throw (ParserException) {
 453
 454   string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
 455                                            d_lexer->getCharPositionInLine(d_lexer),
 456                                            message);
 457
 458   Debug("parser") << "Throwing exception: "
 459       << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
 460       << d_lexer->getLine(d_lexer) << "."
 461       << d_lexer->getCharPositionInLine(d_lexer) << ": "
 462       << updatedMessage << endl;
 463   if(eofException) {
 464     throw ParserEndOfFileException(message,
 465                                    (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 466                                    d_lexer->getLine(d_lexer),
 467                                    d_lexer->getCharPositionInLine(d_lexer));
 468   } else {
 469     throw ParserException(updatedMessage,
 470                           (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 471                           d_lexer->getLine(d_lexer),
 472                           d_lexer->getCharPositionInLine(d_lexer));
 473   }
 474 }
 475
 476
 477 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
 478   d_lexer = pLexer;
 479
 480   pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
 481   if( pTokenFactory != NULL ) {
 482     pTokenFactory->close(pTokenFactory);
 483   }
 484
 485   /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
 486   pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
 487   if( pTokenFactory == NULL ) {
 488     throw InputStreamException("Couldn't create token factory.");
 489   }
 490   d_lexer->rec->state->tokFactory = pTokenFactory;
 491
 492   pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
 493   if( buffer == NULL ) {
 494     throw InputStreamException("Couldn't create token buffer.");
 495   }
 496
 497   d_tokenBuffer = buffer;
 498
 499   // Override default lexer error reporting
 500   d_lexer->rec->reportError = &lexerError;
 501   // Override default nextToken function, just to prevent exceptions escaping.
 502   d_lexer->rec->state->tokSource->nextToken = &nextToken;
 503 }
 504
 505 void AntlrInput::setParser(Parser& parser) {
 506   // ANTLR isn't using super in the lexer or the parser, AFAICT.
 507   // We could also use @lexer/parser::context to add a field to the generated
 508   // objects, but then it would have to be declared separately in every
 509   // language's grammar and we'd have to in the address of the field anyway.
 510   d_lexer->super = &parser;
 511   d_parser->super = &parser;
 512 }
 513
 514 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
 515   d_parser = pParser;
 516 //  d_parser->rec->match = &match;
 517   d_parser->rec->reportError = &reportError;
 518   /* Don't try to recover from a parse error. */
 519   // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
 520   d_parser->rec->recoverFromMismatchedToken =
 521       (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
 522       d_parser->rec->mismatch;
 523 }
 524
 525 }/* CVC4::parser namespace */
 526 }/* CVC4 namespace */