src/parser/antlr_input.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input.cpp
   3  ** \verbatim
   4  ** Original author: Christopher L. Conway
   5  ** Major contributors: Morgan Deters, Kshitij Bansal
   6  ** Minor contributors (to current version): Francois Bobot
   7  ** This file is part of the CVC4 project.
   8  ** Copyright (c) 2009-2014  New York University and The University of Iowa
   9  ** See the file COPYING in the top-level source directory for licensing
  10  ** information.\endverbatim
  11  **
  12  ** \brief A super-class for ANTLR-generated input language parsers.
  13  **
  14  ** A super-class for ANTLR-generated input language parsers
  15  **/
  16
#include <limits.h>
#include <antlr3.h>
#include <stdint.h>

#include <cstdlib>
#include <cstring>

#include "parser/antlr_input.h"
#include "parser/input.h"
#include "parser/bounded_token_buffer.h"
#include "parser/bounded_token_factory.h"
#include "parser/antlr_line_buffered_input.h"
#include "parser/memory_mapped_input_buffer.h"
#include "parser/parser_exception.h"
#include "parser/parser.h"

#include "expr/command.h"
#include "expr/type.h"
#include "parser/cvc/cvc_input.h"
#include "parser/smt1/smt1_input.h"
#include "parser/smt2/smt2_input.h"
#include "parser/tptp/tptp_input.h"
#include "util/output.h"
  37
  38 using namespace std;
  39 using namespace CVC4;
  40 using namespace CVC4::parser;
  41 using namespace CVC4::kind;
  42
  43 namespace CVC4 {
  44 namespace parser {
  45
  46 AntlrInputStream::AntlrInputStream(std::string name,
  47                                    pANTLR3_INPUT_STREAM input,
  48                                    bool fileIsTemporary) :
  49   InputStream(name, fileIsTemporary),
  50   d_input(input) {
  51   assert( input != NULL );
  52   input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
  53 }
  54
  55 AntlrInputStream::~AntlrInputStream() {
  56   d_input->free(d_input);
  57 }
  58
  59 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
  60   return d_input;
  61 }
  62
  63 AntlrInputStream*
  64 AntlrInputStream::newFileInputStream(const std::string& name,
  65                                      bool useMmap)
  66   throw (InputStreamException) {
  67 #ifdef _WIN32
  68   if(useMmap) {
  69     useMmap = false;
  70   }
  71 #endif
  72   pANTLR3_INPUT_STREAM input = NULL;
  73   if(useMmap) {
  74     input = MemoryMappedInputBufferNew(name);
  75   } else {
  76     // libantlr3c v3.2 isn't source-compatible with v3.4
  77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  78     input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) name.c_str());
  79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  80     input = antlr3FileStreamNew((pANTLR3_UINT8) name.c_str(), ANTLR3_ENC_8BIT);
  81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  82   }
  83   if(input == NULL) {
  84     throw InputStreamException("Couldn't open file: " + name);
  85   }
  86   return new AntlrInputStream( name, input );
  87 }
  88
  89 AntlrInputStream*
  90 AntlrInputStream::newStreamInputStream(std::istream& input,
  91                                        const std::string& name,
  92                                        bool lineBuffered)
  93   throw (InputStreamException) {
  94
  95   pANTLR3_INPUT_STREAM inputStream = NULL;
  96
  97   if(lineBuffered) {
  98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  99     inputStream =
 100       antlr3LineBufferedStreamNew(input,
 101                                   0,
 102                                   (pANTLR3_UINT8) strdup(name.c_str()));
 103 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 104     inputStream =
 105       antlr3LineBufferedStreamNew(input,
 106                                   ANTLR3_ENC_8BIT,
 107                                   (pANTLR3_UINT8) strdup(name.c_str()));
 108 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 109   } else {
 110
 111     // Since these are all NULL on entry, realloc will be called
 112     char *basep = NULL, *boundp = NULL, *cp = NULL;
 113     /* 64KB seems like a reasonable default size. */
 114     size_t bufSize = 0x10000;
 115
 116     /* Keep going until we can't go no more. */
 117     while( !input.eof() && !input.fail() ) {
 118
 119       if( cp == boundp ) {
 120         /* We ran out of room in the buffer. Realloc at double the size. */
 121         ptrdiff_t offset = cp - basep;
 122         basep = (char *) realloc(basep, bufSize);
 123         if( basep == NULL ) {
 124           throw InputStreamException("Failed buffering input stream: " + name);
 125         }
 126         cp = basep + offset;
 127         boundp = basep + bufSize;
 128         bufSize *= 2;
 129       }
 130
 131       /* Read as much as we have room for. */
 132       input.read( cp, boundp - cp );
 133       cp += input.gcount();
 134     }
 135
 136     /* Make sure the fail bit didn't get set. */
 137     if( !input.eof() ) {
 138       throw InputStreamException("Stream input failed: " + name);
 139     }
 140
 141     /* Create an ANTLR input backed by the buffer. */
 142 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
 143     inputStream =
 144       antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) basep,
 145                                         cp - basep,
 146                                         (pANTLR3_UINT8) strdup(name.c_str()));
 147 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 148     inputStream =
 149       antlr3StringStreamNew((pANTLR3_UINT8) basep,
 150                             ANTLR3_ENC_8BIT,
 151                             cp - basep,
 152                             (pANTLR3_UINT8) strdup(name.c_str()));
 153 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 154
 155   }
 156
 157   if( inputStream == NULL ) {
 158     throw InputStreamException("Couldn't initialize input: " + name);
 159   }
 160
 161   return new AntlrInputStream( name, inputStream );
 162 }
 163
 164
 165 AntlrInputStream*
 166 AntlrInputStream::newStringInputStream(const std::string& input,
 167                                        const std::string& name)
 168   throw (InputStreamException) {
 169   char* inputStr = strdup(input.c_str());
 170   char* nameStr = strdup(name.c_str());
 171   assert( inputStr!=NULL && nameStr!=NULL );
 172 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
 173   pANTLR3_INPUT_STREAM inputStream =
 174       antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) inputStr,
 175                                         input.size(),
 176                                         (pANTLR3_UINT8) nameStr);
 177 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 178   pANTLR3_INPUT_STREAM inputStream =
 179       antlr3StringStreamNew((pANTLR3_UINT8) inputStr,
 180                             ANTLR3_ENC_8BIT,
 181                             input.size(),
 182                             (pANTLR3_UINT8) nameStr);
 183 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 184   if( inputStream==NULL ) {
 185     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 186   }
 187   return new AntlrInputStream( name, inputStream );
 188 }
 189
 190 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
 191   using namespace language::input;
 192
 193   AntlrInput* input;
 194
 195   switch(lang) {
 196   case LANG_CVC4: {
 197     input = new CvcInput(inputStream);
 198     break;
 199   }
 200   case LANG_SMTLIB_V1:
 201     input = new Smt1Input(inputStream);
 202     break;
 203
 204   case LANG_SMTLIB_V2:
 205     input = new Smt2Input(inputStream);
 206     break;
 207
 208   case LANG_TPTP:
 209     input = new TptpInput(inputStream);
 210     break;
 211
 212   default:
 213     std::stringstream ss;
 214     ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
 215     throw InputStreamException(ss.str());
 216   }
 217
 218   return input;
 219 }
 220
 221 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
 222     Input(inputStream),
 223     d_lookahead(lookahead),
 224     d_lexer(NULL),
 225     d_parser(NULL),
 226     d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
 227     d_tokenBuffer(NULL) {
 228 }
 229
 230 /*
 231 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
 232   Parser(exprManager,name),
 233   d_lookahead(lookahead) {
 234
 235 }
 236 */
 237
 238 /*
 239 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
 240   Input(exprManager,name),
 241   d_lookahead(lookahead),
 242   d_lexer(NULL),
 243   d_parser(NULL),
 244   d_tokenStream(NULL) {
 245
 246   char* inputStr = strdup(input.c_str());
 247   char* nameStr = strdup(name.c_str());
 248   if( inputStr==NULL || nameStr==NULL ) {
 249     throw ParserException("Couldn't initialize string input: '" + input + "'");
 250   }
 251   d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
 252   if( d_inputStream == NULL ) {
 253     throw ParserException("Couldn't create input stream for string: '" + input + "'");
 254   }
 255
 256 }
 257 */
 258
 259 AntlrInput::~AntlrInput() {
 260   BoundedTokenBufferFree(d_tokenBuffer);
 261 }
 262
 263 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
 264   return d_tokenBuffer->commonTstream;
 265 }
 266
 267 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
 268   pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
 269   assert(lexer!=NULL);
 270   Parser *parser = (Parser*)(lexer->super);
 271   assert(parser!=NULL);
 272   AntlrInput *input = (AntlrInput*) parser->getInput();
 273   assert(input!=NULL);
 274
 275   /* Call the error display routine *if* there's not already a
 276    * parse error pending.  If a parser error is pending, this
 277    * error is probably less important, so we just drop it. */
 278   if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
 279     input->parseError("Error finding next token.");
 280   }
 281 }
 282
 283 void AntlrInput::warning(const std::string& message) {
 284   Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
 285 }
 286
 287
 288 /**
 289  * characters considered part of a simple symbol in SMTLIB.
 290  *
 291  * TODO: Ideally this code shouldn't be smtlib specific (should work
 292  * with CVC language too), but this per-language specialization has
 293  * been left for a later point.
 294  */
 295 inline bool isSimpleChar(char ch) {
 296   return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
 297 }
 298
 299 size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
 300   size_t st = 0;
 301   size_t N = input.size();
 302   while(st < N) {
 303     while( st < N && (*isWordChar)(input[st])  == false ) st++;
 304     size_t en = st;
 305     while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
 306     if(en - st + 1 == pattern.size()) {
 307       bool match = true;
 308       for(size_t i = 0; match && i < pattern.size(); ++i) {
 309         match &= (pattern[i] == input[st+i]);
 310       }
 311       if(match == true) {
 312         return st;
 313       }
 314     }
 315     st = en + 1;
 316   }
 317   return string::npos;
 318 }
 319
 320 /**
 321  * Gets part of original input and tries to visually hint where the
 322  * error might be.
 323  *
 324  * Something like:
 325  *
 326  *   ...nd (= alpha beta) (= beta delta))
 327  *                                ^
 328  *
 329  * Implementation (as on 2014/04/24):
 330  *
 331  * > if suggested pointer by lexer is under a "simple char", move to
 332  *   start of the word and print pointer there.
 333  *
 334  * > in the other case, it tries to find the nearest word in the error
 335  *   message passed along. if it can't find it, we don't add this
 336  *   visual hint, as experimentally position suggested by lexer was
 337  *   found to be totally unhelpful. (TODO: fix this upstream to
 338  *   improve)
 339  */
 340 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
 341 {
 342   // Is it a multi-line message
 343   bool multilineMessage = (message.find('\n') != string::npos);
 344   // Useful only if it is a multi-line message
 345   int firstLineEnd = message.find('\n');
 346
 347   std::ostringstream ss, slicess;
 348
 349   // Keep first line of message
 350   if(multilineMessage) {
 351     ss << message.substr(0, firstLineEnd) << endl << endl;
 352   } else {
 353     ss << message << endl << endl;
 354   }
 355
 356   int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
 357   int posSliceEnd = posSliceStart + 70;
 358   int caretPos = 0;
 359   int caretPosExtra = 0; // for inital intendation, epilipses etc.
 360
 361   ss << "  "; caretPosExtra += 2;
 362   if(posSliceStart > 0) {
 363     ss << "..."; caretPosExtra += 3;
 364   }
 365
 366   for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
 367     if(i == posSliceEnd) {
 368       ss << "...";
 369       break;
 370     }
 371     if(i < charPositionInLine) { caretPos++; }
 372
 373     if(!isprint(lineStart[i])) {
 374       // non-printable character, something wrong, bail out
 375       return message;
 376     }
 377
 378     ss << (lineStart[i]);
 379     slicess << (lineStart[i]);
 380   }
 381
 382   // adjust position of caret, based on slice and message
 383   {
 384     int caretPosOrig = caretPos;
 385     string slice = slicess.str();
 386     if(isSimpleChar(slice[caretPos])) {
 387       // if alphanumeric, try to go to beginning of word/number
 388       while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
 389       if(caretPos == 0 && posSliceStart > 0) {
 390         // reached start and this is not really the start? bail out
 391         return message;
 392       } else {
 393         // likely it is also in original message? if so, very likely
 394         // we found the right place
 395         string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
 396         size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 397         Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 398         if( matchLoc != string::npos ) {
 399           Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
 400         }
 401       }
 402     } else {
 403       bool foundCaretPos = false;
 404
 405       for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
 406         // go to nearest alphanumeric string (before current position),
 407         // see if that word can be found in original message. If so,
 408         // point to that, else keep pointer where it was.
 409         int nearestWordEn = caretPos - 1;
 410         while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
 411           --nearestWordEn;
 412         }
 413         if(isSimpleChar(slice[nearestWordEn])) {
 414           int nearestWordSt = nearestWordEn;
 415           while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
 416             --nearestWordSt;
 417           }
 418           string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
 419           size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 420           Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
 421           Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 422           if( matchLoc != string::npos ) {
 423             Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
 424                                     << nearestWordSt << std::endl;
 425             foundCaretPos = true;
 426           }
 427           caretPos = nearestWordSt;
 428         }
 429       }
 430       if( !foundCaretPos) {
 431         // this doesn't look good. caret generally getting printed
 432         // at unhelpful positions. improve upstream?
 433         return message;
 434       }
 435     }
 436     caretPos += caretPosExtra;
 437   }// end of caret position computation/heuristics
 438
 439   ss << std::endl;
 440   while( caretPos-- > 0 ) {
 441     ss << ' ';
 442   }
 443   ss << '^' << endl;
 444   if(multilineMessage) {
 445      ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
 446   }
 447   return ss.str();
 448 }
 449
 450 void AntlrInput::parseError(const std::string& message, bool eofException)
 451   throw (ParserException) {
 452
 453   string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
 454                                            d_lexer->getCharPositionInLine(d_lexer),
 455                                            message);
 456
 457   Debug("parser") << "Throwing exception: "
 458       << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
 459       << d_lexer->getLine(d_lexer) << "."
 460       << d_lexer->getCharPositionInLine(d_lexer) << ": "
 461       << updatedMessage << endl;
 462   if(eofException) {
 463     throw ParserEndOfFileException(message,
 464                                    (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 465                                    d_lexer->getLine(d_lexer),
 466                                    d_lexer->getCharPositionInLine(d_lexer));
 467   } else {
 468     throw ParserException(updatedMessage,
 469                           (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 470                           d_lexer->getLine(d_lexer),
 471                           d_lexer->getCharPositionInLine(d_lexer));
 472   }
 473 }
 474
 475
 476 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
 477   d_lexer = pLexer;
 478
 479   pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
 480   if( pTokenFactory != NULL ) {
 481     pTokenFactory->close(pTokenFactory);
 482   }
 483
 484   /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
 485   pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
 486   if( pTokenFactory == NULL ) {
 487     throw InputStreamException("Couldn't create token factory.");
 488   }
 489   d_lexer->rec->state->tokFactory = pTokenFactory;
 490
 491   pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
 492   if( buffer == NULL ) {
 493     throw InputStreamException("Couldn't create token buffer.");
 494   }
 495
 496   d_tokenBuffer = buffer;
 497
 498   // Override default lexer error reporting
 499   d_lexer->rec->reportError = &lexerError;
 500   // Override default nextToken function, just to prevent exceptions escaping.
 501   d_lexer->rec->state->tokSource->nextToken = &nextToken;
 502 }
 503
 504 void AntlrInput::setParser(Parser& parser) {
 505   // ANTLR isn't using super in the lexer or the parser, AFAICT.
 506   // We could also use @lexer/parser::context to add a field to the generated
 507   // objects, but then it would have to be declared separately in every
 508   // language's grammar and we'd have to in the address of the field anyway.
 509   d_lexer->super = &parser;
 510   d_parser->super = &parser;
 511 }
 512
 513 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
 514   d_parser = pParser;
 515 //  d_parser->rec->match = &match;
 516   d_parser->rec->reportError = &reportError;
 517   /* Don't try to recover from a parse error. */
 518   // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
 519   d_parser->rec->recoverFromMismatchedToken =
 520       (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
 521       d_parser->rec->mismatch;
 522 }
 523
 524 }/* CVC4::parser namespace */
 525 }/* CVC4 namespace */