src/parser/antlr_input.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input.cpp
   3  ** \verbatim
   4  ** Original author: Christopher L. Conway
   5  ** Major contributors: Morgan Deters, Kshitij Bansal
   6  ** Minor contributors (to current version): Francois Bobot
   7  ** This file is part of the CVC4 project.
   8  ** Copyright (c) 2009-2014  New York University and The University of Iowa
   9  ** See the file COPYING in the top-level source directory for licensing
  10  ** information.\endverbatim
  11  **
  12  ** \brief A super-class for ANTLR-generated input language parsers.
  13  **
  14  ** A super-class for ANTLR-generated input language parsers
  15  **/
  16
  17 #include <limits.h>
  18 #include <antlr3.h>
  19 #include <stdint.h>
  20
  21 #include "base/output.h"
  22 #include "expr/type.h"
  23 #include "parser/antlr_input.h"
  24 #include "parser/antlr_line_buffered_input.h"
  25 #include "parser/bounded_token_buffer.h"
  26 #include "parser/bounded_token_factory.h"
  27 #include "parser/cvc/cvc_input.h"
  28 #include "parser/input.h"
  29 #include "parser/memory_mapped_input_buffer.h"
  30 #include "parser/parser.h"
  31 #include "parser/parser_exception.h"
  32 #include "parser/smt1/smt1_input.h"
  33 #include "parser/smt2/smt2_input.h"
  34 #include "parser/smt2/sygus_input.h"
  35 #include "parser/tptp/tptp_input.h"
  36 #include "smt_util/command.h"
  37
  38 using namespace std;
  39 using namespace CVC4;
  40 using namespace CVC4::parser;
  41 using namespace CVC4::kind;
  42
  43 namespace CVC4 {
  44 namespace parser {
  45
  46 // These functions exactly wrap the antlr3 source inconsistencies.
  47 // These are the only location CVC4_ANTLR3_OLD_INPUT_STREAM ifdefs appear.
  48 // No other sanity checking happens;
  49 pANTLR3_INPUT_STREAM newAntlr3BufferedStream(std::istream& input, const std::string& name);
  50 pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name);
  51 pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
  52                                             uint32_t size,
  53                                             const std::string& name);
  54
  55 pANTLR3_INPUT_STREAM
  56 newAntlr3BufferedStream(std::istream& input, const std::string& name){
  57   pANTLR3_INPUT_STREAM inputStream = NULL;
  58   pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
  59
  60 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  61   inputStream =
  62     antlr3LineBufferedStreamNew(input, 0, name_duplicate);
  63 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  64   inputStream =
  65     antlr3LineBufferedStreamNew(input, ANTLR3_ENC_8BIT, name_duplicate);
  66 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  67
  68   free(name_duplicate);
  69   return inputStream;
  70 }
  71
  72 pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name){
  73   pANTLR3_INPUT_STREAM input = NULL;
  74   pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
  75
  76   // libantlr3c v3.2 isn't source-compatible with v3.4
  77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  78   input = antlr3AsciiFileStreamNew(name_duplicate);
  79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  80   input = antlr3FileStreamNew(name_duplicate, ANTLR3_ENC_8BIT);
  81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  82
  83   free(name_duplicate);
  84   return input;
  85 }
  86
  87
  88 pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
  89                                             uint32_t size,
  90                                             const std::string& name){
  91   pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
  92   pANTLR3_INPUT_STREAM inputStream = NULL;
  93   /* Create an ANTLR input backed by the buffer. */
  94 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  95   inputStream =
  96     antlr3NewAsciiStringInPlaceStream(basep, size, name_duplicate);
  97 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  98   inputStream =
  99     antlr3StringStreamNew(basep, ANTLR3_ENC_8BIT, size,
 100                           name_duplicate);
 101 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 102   free(name_duplicate);
 103   return inputStream;
 104 }
 105
 106 AntlrInputStream::AntlrInputStream(std::string name,
 107                                    pANTLR3_INPUT_STREAM input,
 108                                    bool fileIsTemporary,
 109                                    pANTLR3_UINT8 inputString) :
 110   InputStream(name, fileIsTemporary),
 111   d_input(input),
 112   d_inputString(inputString)
 113 {
 114   assert( input != NULL );
 115   input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
 116 }
 117
 118 AntlrInputStream::~AntlrInputStream() {
 119   d_input->free(d_input);
 120   if(d_inputString != NULL){
 121     free(d_inputString);
 122   }
 123 }
 124
 125 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
 126   return d_input;
 127 }
 128
 129
 130
 131 AntlrInputStream*
 132 AntlrInputStream::newFileInputStream(const std::string& name,
 133                                      bool useMmap)
 134   throw (InputStreamException) {
 135 #ifdef _WIN32
 136   if(useMmap) {
 137     useMmap = false;
 138   }
 139 #endif
 140   pANTLR3_INPUT_STREAM input = NULL;
 141   if(useMmap) {
 142     input = MemoryMappedInputBufferNew(name);
 143   } else {
 144     input = newAntlr3FileStream(name);
 145   }
 146   if(input == NULL) {
 147     throw InputStreamException("Couldn't open file: " + name);
 148   }
 149   return new AntlrInputStream( name, input, false, NULL );
 150 }
 151
 152
 153 AntlrInputStream*
 154 AntlrInputStream::newStreamInputStream(std::istream& input,
 155                                        const std::string& name,
 156                                        bool lineBuffered)
 157   throw (InputStreamException) {
 158
 159   pANTLR3_INPUT_STREAM inputStream = NULL;
 160   pANTLR3_UINT8 inputStringCopy = NULL;
 161
 162   if(lineBuffered) {
 163     inputStream = newAntlr3BufferedStream(input, name);
 164   } else {
 165
 166     // Since these are all NULL on entry, realloc will be called
 167     char *basep = NULL, *boundp = NULL, *cp = NULL;
 168     /* 64KB seems like a reasonable default size. */
 169     size_t bufSize = 0x10000;
 170
 171     /* Keep going until we can't go no more. */
 172     while( !input.eof() && !input.fail() ) {
 173
 174       if( cp == boundp ) {
 175         /* We ran out of room in the buffer. Realloc at double the size. */
 176         ptrdiff_t offset = cp - basep;
 177         basep = (char *) realloc(basep, bufSize);
 178         if( basep == NULL ) {
 179           throw InputStreamException("Failed buffering input stream: " + name);
 180         }
 181         cp = basep + offset;
 182         boundp = basep + bufSize;
 183         bufSize *= 2;
 184       }
 185
 186       /* Read as much as we have room for. */
 187       input.read( cp, boundp - cp );
 188       cp += input.gcount();
 189     }
 190
 191     /* Make sure the fail bit didn't get set. */
 192     if( !input.eof() ) {
 193       throw InputStreamException("Stream input failed: " + name);
 194     }
 195     ptrdiff_t offset = cp - basep;
 196     assert(offset >= 0);
 197     assert(offset <= std::numeric_limits<uint32_t>::max());
 198     inputStringCopy = (pANTLR3_UINT8)basep;
 199     inputStream = newAntrl3InPlaceStream(inputStringCopy, (uint32_t) offset, name);
 200   }
 201
 202   if( inputStream == NULL ) {
 203     throw InputStreamException("Couldn't initialize input: " + name);
 204   }
 205
 206   return new AntlrInputStream( name, inputStream, false, inputStringCopy );
 207 }
 208
 209
 210 AntlrInputStream*
 211 AntlrInputStream::newStringInputStream(const std::string& input,
 212                                        const std::string& name)
 213   throw (InputStreamException) {
 214
 215   size_t input_size = input.size();
 216   assert(input_size <= std::numeric_limits<uint32_t>::max());
 217
 218   // Ownership of input_duplicate  is transferred to the AntlrInputStream.
 219   pANTLR3_UINT8 input_duplicate = (pANTLR3_UINT8) strdup(input.c_str());
 220
 221   if( input_duplicate == NULL ) {
 222     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 223   }
 224
 225   pANTLR3_INPUT_STREAM inputStream = newAntrl3InPlaceStream(input_duplicate, (uint32_t)input_size, name);
 226   if( inputStream==NULL ) {
 227     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 228   }
 229   return new AntlrInputStream( name, inputStream, false, input_duplicate );
 230 }
 231
 232 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
 233   using namespace language::input;
 234
 235   AntlrInput* input;
 236
 237   switch(lang) {
 238   case LANG_CVC4: {
 239     input = new CvcInput(inputStream);
 240     break;
 241   }
 242   case LANG_SMTLIB_V1:
 243     input = new Smt1Input(inputStream);
 244     break;
 245
 246   case LANG_SMTLIB_V2_0:
 247   case LANG_SMTLIB_V2_5:
 248     input = new Smt2Input(inputStream, lang);
 249     break;
 250
 251   case LANG_SYGUS:
 252     input = new SygusInput(inputStream);
 253     break;
 254
 255   case LANG_TPTP:
 256     input = new TptpInput(inputStream);
 257     break;
 258
 259   default:
 260     std::stringstream ss;
 261     ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
 262     throw InputStreamException(ss.str());
 263   }
 264
 265   return input;
 266 }
 267
 268 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
 269     Input(inputStream),
 270     d_lookahead(lookahead),
 271     d_lexer(NULL),
 272     d_parser(NULL),
 273     d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
 274     d_tokenBuffer(NULL) {
 275 }
 276
 277 /*
 278 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
 279   Parser(exprManager,name),
 280   d_lookahead(lookahead) {
 281
 282 }
 283 */
 284
 285 /*
 286 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
 287   Input(exprManager,name),
 288   d_lookahead(lookahead),
 289   d_lexer(NULL),
 290   d_parser(NULL),
 291   d_tokenStream(NULL) {
 292
 293   char* inputStr = strdup(input.c_str());
 294   char* nameStr = strdup(name.c_str());
 295   if( inputStr==NULL || nameStr==NULL ) {
 296     throw ParserException("Couldn't initialize string input: '" + input + "'");
 297   }
 298   d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
 299   if( d_inputStream == NULL ) {
 300     throw ParserException("Couldn't create input stream for string: '" + input + "'");
 301   }
 302
 303 }
 304 */
 305
 306 AntlrInput::~AntlrInput() {
 307   BoundedTokenBufferFree(d_tokenBuffer);
 308 }
 309
 310 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
 311   return d_tokenBuffer->commonTstream;
 312 }
 313
 314 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
 315   pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
 316   assert(lexer!=NULL);
 317   Parser *parser = (Parser*)(lexer->super);
 318   assert(parser!=NULL);
 319   AntlrInput *input = (AntlrInput*) parser->getInput();
 320   assert(input!=NULL);
 321
 322   /* Call the error display routine *if* there's not already a
 323    * parse error pending.  If a parser error is pending, this
 324    * error is probably less important, so we just drop it. */
 325   if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
 326     input->parseError("Error finding next token.");
 327   }
 328 }
 329
 330 void AntlrInput::warning(const std::string& message) {
 331   Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
 332 }
 333
 334
 335
 336 /**
 337  * characters considered part of a simple symbol in SMTLIB.
 338  *
 339  * TODO: Ideally this code shouldn't be smtlib specific (should work
 340  * with CVC language too), but this per-language specialization has
 341  * been left for a later point.
 342  */
 343 inline bool isSimpleChar(char ch) {
 344   return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
 345 }
 346
 347 size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
 348   size_t st = 0;
 349   size_t N = input.size();
 350   while(st < N) {
 351     while( st < N && (*isWordChar)(input[st])  == false ) st++;
 352     size_t en = st;
 353     while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
 354     if(en - st + 1 == pattern.size()) {
 355       bool match = true;
 356       for(size_t i = 0; match && i < pattern.size(); ++i) {
 357         match &= (pattern[i] == input[st+i]);
 358       }
 359       if(match == true) {
 360         return st;
 361       }
 362     }
 363     st = en + 1;
 364   }
 365   return string::npos;
 366 }
 367
 368 /**
 369  * Gets part of original input and tries to visually hint where the
 370  * error might be.
 371  *
 372  * Something like:
 373  *
 374  *   ...nd (= alpha beta) (= beta delta))
 375  *                                ^
 376  *
 377  * Implementation (as on 2014/04/24):
 378  *
 379  * > if suggested pointer by lexer is under a "simple char", move to
 380  *   start of the word and print pointer there.
 381  *
 382  * > in the other case, it tries to find the nearest word in the error
 383  *   message passed along. if it can't find it, we don't add this
 384  *   visual hint, as experimentally position suggested by lexer was
 385  *   found to be totally unhelpful. (TODO: fix this upstream to
 386  *   improve)
 387  */
 388 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
 389 {
 390   // Is it a multi-line message
 391   bool multilineMessage = (message.find('\n') != string::npos);
 392   // Useful only if it is a multi-line message
 393   int firstLineEnd = message.find('\n');
 394
 395   std::ostringstream ss, slicess;
 396
 397   // Keep first line of message
 398   if(multilineMessage) {
 399     ss << message.substr(0, firstLineEnd) << endl << endl;
 400   } else {
 401     ss << message << endl << endl;
 402   }
 403
 404   int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
 405   int posSliceEnd = posSliceStart + 70;
 406   int caretPos = 0;
 407   int caretPosExtra = 0; // for inital intendation, epilipses etc.
 408
 409   ss << "  "; caretPosExtra += 2;
 410   if(posSliceStart > 0) {
 411     ss << "..."; caretPosExtra += 3;
 412   }
 413
 414   for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
 415     if(i == posSliceEnd) {
 416       ss << "...";
 417       break;
 418     }
 419     if(i < charPositionInLine) { caretPos++; }
 420
 421     if(!isprint(lineStart[i])) {
 422       // non-printable character, something wrong, bail out
 423       return message;
 424     }
 425
 426     ss << (lineStart[i]);
 427     slicess << (lineStart[i]);
 428   }
 429
 430   // adjust position of caret, based on slice and message
 431   {
 432     int caretPosOrig = caretPos;
 433     string slice = slicess.str();
 434     if(isSimpleChar(slice[caretPos])) {
 435       // if alphanumeric, try to go to beginning of word/number
 436       while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
 437       if(caretPos == 0 && posSliceStart > 0) {
 438         // reached start and this is not really the start? bail out
 439         return message;
 440       } else {
 441         // likely it is also in original message? if so, very likely
 442         // we found the right place
 443         string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
 444         size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 445         Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 446         if( matchLoc != string::npos ) {
 447           Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
 448         }
 449       }
 450     } else {
 451       bool foundCaretPos = false;
 452
 453       for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
 454         // go to nearest alphanumeric string (before current position),
 455         // see if that word can be found in original message. If so,
 456         // point to that, else keep pointer where it was.
 457         int nearestWordEn = caretPos - 1;
 458         while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
 459           --nearestWordEn;
 460         }
 461         if(isSimpleChar(slice[nearestWordEn])) {
 462           int nearestWordSt = nearestWordEn;
 463           while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
 464             --nearestWordSt;
 465           }
 466           string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
 467           size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 468           Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
 469           Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 470           if( matchLoc != string::npos ) {
 471             Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
 472                                     << nearestWordSt << std::endl;
 473             foundCaretPos = true;
 474           }
 475           caretPos = nearestWordSt;
 476         }
 477       }
 478       if( !foundCaretPos) {
 479         // this doesn't look good. caret generally getting printed
 480         // at unhelpful positions. improve upstream?
 481         return message;
 482       }
 483     }
 484     caretPos += caretPosExtra;
 485   }// end of caret position computation/heuristics
 486
 487   ss << std::endl;
 488   while( caretPos-- > 0 ) {
 489     ss << ' ';
 490   }
 491   ss << '^' << endl;
 492   if(multilineMessage) {
 493      ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
 494   }
 495   return ss.str();
 496 }
 497
 498 void AntlrInput::parseError(const std::string& message, bool eofException)
 499   throw (ParserException) {
 500
 501   string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
 502                                            d_lexer->getCharPositionInLine(d_lexer),
 503                                            message);
 504
 505   Debug("parser") << "Throwing exception: "
 506       << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
 507       << d_lexer->getLine(d_lexer) << "."
 508       << d_lexer->getCharPositionInLine(d_lexer) << ": "
 509       << updatedMessage << endl;
 510   if(eofException) {
 511     throw ParserEndOfFileException(message,
 512                                    (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 513                                    d_lexer->getLine(d_lexer),
 514                                    d_lexer->getCharPositionInLine(d_lexer));
 515   } else {
 516     throw ParserException(updatedMessage,
 517                           (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 518                           d_lexer->getLine(d_lexer),
 519                           d_lexer->getCharPositionInLine(d_lexer));
 520   }
 521 }
 522
 523
 524 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
 525   d_lexer = pLexer;
 526
 527   pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
 528   if( pTokenFactory != NULL ) {
 529     pTokenFactory->close(pTokenFactory);
 530   }
 531
 532   /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
 533   pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
 534   if( pTokenFactory == NULL ) {
 535     throw InputStreamException("Couldn't create token factory.");
 536   }
 537   d_lexer->rec->state->tokFactory = pTokenFactory;
 538
 539   pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
 540   if( buffer == NULL ) {
 541     throw InputStreamException("Couldn't create token buffer.");
 542   }
 543
 544   d_tokenBuffer = buffer;
 545
 546   // Override default lexer error reporting
 547   d_lexer->rec->reportError = &lexerError;
 548   // Override default nextToken function, just to prevent exceptions escaping.
 549   d_lexer->rec->state->tokSource->nextToken = &nextToken;
 550 }
 551
 552 void AntlrInput::setParser(Parser& parser) {
 553   // ANTLR isn't using super in the lexer or the parser, AFAICT.
 554   // We could also use @lexer/parser::context to add a field to the generated
 555   // objects, but then it would have to be declared separately in every
 556   // language's grammar and we'd have to in the address of the field anyway.
 557   d_lexer->super = &parser;
 558   d_parser->super = &parser;
 559 }
 560
 561 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
 562   d_parser = pParser;
 563 //  d_parser->rec->match = &match;
 564   d_parser->rec->reportError = &reportError;
 565   /* Don't try to recover from a parse error. */
 566   // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
 567   d_parser->rec->recoverFromMismatchedToken =
 568       (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
 569       d_parser->rec->mismatch;
 570 }
 571
 572 }/* CVC4::parser namespace */
 573 }/* CVC4 namespace */