src/parser/antlr_input.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input.cpp
   3  ** \verbatim
   4  ** Original author: Christopher L. Conway
   5  ** Major contributors: Morgan Deters
   6  ** Minor contributors (to current version): Francois Bobot, Kshitij Bansal
   7  ** This file is part of the CVC4 project.
   8  ** Copyright (c) 2009-2014  New York University and The University of Iowa
   9  ** See the file COPYING in the top-level source directory for licensing
  10  ** information.\endverbatim
  11  **
  12  ** \brief A super-class for ANTLR-generated input language parsers.
  13  **
  14  ** A super-class for ANTLR-generated input language parsers
  15  **/
  16
  17 #include <limits.h>
  18 #include <antlr3.h>
  19 #include <stdint.h>
  20
  21 #include "parser/antlr_input.h"
  22 #include "parser/input.h"
  23 #include "parser/bounded_token_buffer.h"
  24 #include "parser/bounded_token_factory.h"
  25 #include "parser/antlr_line_buffered_input.h"
  26 #include "parser/memory_mapped_input_buffer.h"
  27 #include "parser/parser_exception.h"
  28 #include "parser/parser.h"
  29
  30 #include "expr/command.h"
  31 #include "expr/type.h"
  32 #include "parser/cvc/cvc_input.h"
  33 #include "parser/smt1/smt1_input.h"
  34 #include "parser/smt2/smt2_input.h"
  35 #include "parser/tptp/tptp_input.h"
  36 #include "util/output.h"
  37
  38 using namespace std;
  39 using namespace CVC4;
  40 using namespace CVC4::parser;
  41 using namespace CVC4::kind;
  42
  43 namespace CVC4 {
  44 namespace parser {
  45
  46 AntlrInputStream::AntlrInputStream(std::string name,
  47                                    pANTLR3_INPUT_STREAM input,
  48                                    bool fileIsTemporary) :
  49   InputStream(name, fileIsTemporary),
  50   d_input(input) {
  51   assert( input != NULL );
  52   input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
  53 }
  54
  55 AntlrInputStream::~AntlrInputStream() {
  56   d_input->free(d_input);
  57 }
  58
  59 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
  60   return d_input;
  61 }
  62
  63 AntlrInputStream*
  64 AntlrInputStream::newFileInputStream(const std::string& name,
  65                                      bool useMmap)
  66   throw (InputStreamException) {
  67 #ifdef _WIN32
  68   if(useMmap) {
  69     useMmap = false;
  70   }
  71 #endif
  72   pANTLR3_INPUT_STREAM input = NULL;
  73   if(useMmap) {
  74     input = MemoryMappedInputBufferNew(name);
  75   } else {
  76     // libantlr3c v3.2 isn't source-compatible with v3.4
  77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  78     input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) name.c_str());
  79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  80     input = antlr3FileStreamNew((pANTLR3_UINT8) name.c_str(), ANTLR3_ENC_8BIT);
  81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  82   }
  83   if(input == NULL) {
  84     throw InputStreamException("Couldn't open file: " + name);
  85   }
  86   return new AntlrInputStream( name, input );
  87 }
  88
  89 AntlrInputStream*
  90 AntlrInputStream::newStreamInputStream(std::istream& input,
  91                                        const std::string& name,
  92                                        bool lineBuffered)
  93   throw (InputStreamException) {
  94
  95   pANTLR3_INPUT_STREAM inputStream = NULL;
  96
  97   if(lineBuffered) {
  98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  99     inputStream =
 100       antlr3LineBufferedStreamNew(input,
 101                                   0,
 102                                   (pANTLR3_UINT8) strdup(name.c_str()));
 103 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 104     inputStream =
 105       antlr3LineBufferedStreamNew(input,
 106                                   ANTLR3_ENC_8BIT,
 107                                   (pANTLR3_UINT8) strdup(name.c_str()));
 108 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 109   } else {
 110
 111     // Since these are all NULL on entry, realloc will be called
 112     char *basep = NULL, *boundp = NULL, *cp = NULL;
 113     /* 64KB seems like a reasonable default size. */
 114     size_t bufSize = 0x10000;
 115
 116     /* Keep going until we can't go no more. */
 117     while( !input.eof() && !input.fail() ) {
 118
 119       if( cp == boundp ) {
 120         /* We ran out of room in the buffer. Realloc at double the size. */
 121         ptrdiff_t offset = cp - basep;
 122         basep = (char *) realloc(basep, bufSize);
 123         if( basep == NULL ) {
 124           throw InputStreamException("Failed buffering input stream: " + name);
 125         }
 126         cp = basep + offset;
 127         boundp = basep + bufSize;
 128         bufSize *= 2;
 129       }
 130
 131       /* Read as much as we have room for. */
 132       input.read( cp, boundp - cp );
 133       cp += input.gcount();
 134     }
 135
 136     /* Make sure the fail bit didn't get set. */
 137     if( !input.eof() ) {
 138       throw InputStreamException("Stream input failed: " + name);
 139     }
 140
 141     /* Create an ANTLR input backed by the buffer. */
 142 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
 143     inputStream =
 144       antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) basep,
 145                                         cp - basep,
 146                                         (pANTLR3_UINT8) strdup(name.c_str()));
 147 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 148     inputStream =
 149       antlr3StringStreamNew((pANTLR3_UINT8) basep,
 150                             ANTLR3_ENC_8BIT,
 151                             cp - basep,
 152                             (pANTLR3_UINT8) strdup(name.c_str()));
 153 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 154
 155   }
 156
 157   if( inputStream == NULL ) {
 158     throw InputStreamException("Couldn't initialize input: " + name);
 159   }
 160
 161   return new AntlrInputStream( name, inputStream );
 162 }
 163
 164
 165 AntlrInputStream*
 166 AntlrInputStream::newStringInputStream(const std::string& input,
 167                                        const std::string& name)
 168   throw (InputStreamException) {
 169   char* inputStr = strdup(input.c_str());
 170   char* nameStr = strdup(name.c_str());
 171   assert( inputStr!=NULL && nameStr!=NULL );
 172 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
 173   pANTLR3_INPUT_STREAM inputStream =
 174       antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) inputStr,
 175                                         input.size(),
 176                                         (pANTLR3_UINT8) nameStr);
 177 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 178   pANTLR3_INPUT_STREAM inputStream =
 179       antlr3StringStreamNew((pANTLR3_UINT8) inputStr,
 180                             ANTLR3_ENC_8BIT,
 181                             input.size(),
 182                             (pANTLR3_UINT8) nameStr);
 183 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 184   if( inputStream==NULL ) {
 185     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 186   }
 187   return new AntlrInputStream( name, inputStream );
 188 }
 189
 190 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
 191   using namespace language::input;
 192
 193   AntlrInput* input;
 194
 195   switch(lang) {
 196   case LANG_CVC4: {
 197     input = new CvcInput(inputStream);
 198     break;
 199   }
 200   case LANG_SMTLIB_V1:
 201     input = new Smt1Input(inputStream);
 202     break;
 203
 204   case LANG_SMTLIB_V2:
 205     input = new Smt2Input(inputStream);
 206     break;
 207
 208   case LANG_TPTP:
 209     input = new TptpInput(inputStream);
 210     break;
 211
 212   default:
 213     std::stringstream ss;
 214     ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
 215     throw InputStreamException(ss.str());
 216   }
 217
 218   return input;
 219 }
 220
 221 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
 222     Input(inputStream),
 223     d_lookahead(lookahead),
 224     d_lexer(NULL),
 225     d_parser(NULL),
 226     d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
 227     d_tokenBuffer(NULL) {
 228 }
 229
 230 /*
 231 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
 232   Parser(exprManager,name),
 233   d_lookahead(lookahead) {
 234
 235 }
 236 */
 237
 238 /*
 239 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
 240   Input(exprManager,name),
 241   d_lookahead(lookahead),
 242   d_lexer(NULL),
 243   d_parser(NULL),
 244   d_tokenStream(NULL) {
 245
 246   char* inputStr = strdup(input.c_str());
 247   char* nameStr = strdup(name.c_str());
 248   if( inputStr==NULL || nameStr==NULL ) {
 249     throw ParserException("Couldn't initialize string input: '" + input + "'");
 250   }
 251   d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
 252   if( d_inputStream == NULL ) {
 253     throw ParserException("Couldn't create input stream for string: '" + input + "'");
 254   }
 255
 256 }
 257 */
 258
 259 AntlrInput::~AntlrInput() {
 260   BoundedTokenBufferFree(d_tokenBuffer);
 261 }
 262
 263 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
 264   return d_tokenBuffer->commonTstream;
 265 }
 266
 267 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
 268   pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
 269   assert(lexer!=NULL);
 270   Parser *parser = (Parser*)(lexer->super);
 271   assert(parser!=NULL);
 272   AntlrInput *input = (AntlrInput*) parser->getInput();
 273   assert(input!=NULL);
 274
 275   /* Call the error display routine *if* there's not already a
 276    * parse error pending.  If a parser error is pending, this
 277    * error is probably less important, so we just drop it. */
 278   if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
 279     input->parseError("Error finding next token.");
 280   }
 281 }
 282
 283 void AntlrInput::warning(const std::string& message) {
 284   Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
 285 }
 286
 287
 288 /**
 289  * characters considered part of a simple symbol in SMTLIB.
 290  *
 291  * TODO: Ideally this code shouldn't be smtlib specific (should work
 292  * with CVC language too), but this per-language specialization has
 293  * been left for a later point.
 294  */
 295 inline bool isSimpleChar(char ch) {
 296   return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
 297 }
 298
 299 /**
 300  * Gets part of original input and tries to visually hint where the
 301  * error might be.
 302  *
 303  * Something like:
 304  *
 305  *   ...nd (= alpha beta) (= beta delta))
 306  *                                ^
 307  *
 308  * Implementation (as on 2014/04/24):
 309  *
 310  * > if suggested pointer by lexer is under a "simple char", move to
 311  *   start of the word and print pointer there.
 312  *
 313  * > in the other case, it tries to find the nearest word in the error
 314  *   message passed along. if it can't find it, we don't add this
 315  *   visual hint, as experimentally position suggested by lexer was
 316  *   found to be totally unhelpful. (TODO: fix this upstream to
 317  *   improve)
 318  */
 319 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
 320 {
 321   // Is it a multi-line message
 322   bool multilineMessage = (message.find('\n') != string::npos);
 323   // Useful only if it is a multi-line message
 324   int firstLineEnd = message.find('\n');
 325
 326   std::ostringstream ss, slicess;
 327
 328   // Keep first line of message
 329   if(multilineMessage) {
 330     ss << message.substr(0, firstLineEnd) << endl << endl;
 331   } else {
 332     ss << message << endl << endl;
 333   }
 334
 335   int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
 336   int posSliceEnd = posSliceStart + 70;
 337   int caretPos = 0;
 338   int caretPosExtra = 0; // for inital intendation, epilipses etc.
 339
 340   ss << "  "; caretPosExtra += 2;
 341   if(posSliceStart > 0) {
 342     ss << "..."; caretPosExtra += 3;
 343   }
 344
 345   for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
 346     if(i == posSliceEnd) {
 347       ss << "...";
 348       break;
 349     }
 350     if(i < charPositionInLine) { caretPos++; }
 351
 352     if(!isprint(lineStart[i])) {
 353       // non-printable character, something wrong, bail out
 354       return message;
 355     }
 356
 357     ss << (lineStart[i]);
 358     slicess << (lineStart[i]);
 359   }
 360
 361   // adjust position of caret, based on slice and message
 362   {
 363     int caretPosOrig = caretPos;
 364     string slice = slicess.str();
 365     if(isSimpleChar(slice[caretPos])) {
 366       // if alphanumeric, try to go to beginning of word/number
 367       while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
 368       if(caretPos == 0 && posSliceStart > 0) {
 369         // reached start and this is not really the start? bail out
 370         return message;
 371       } else {
 372         // likely it is also in original message? if so, very likely
 373         // we found the right place
 374         string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
 375         unsigned messagePosSt = message.find(word);
 376         unsigned messagePosEn = messagePosSt + (caretPosOrig - caretPos);
 377         if( messagePosSt < string::npos &&
 378             (messagePosSt == 0 || !isSimpleChar(message[messagePosSt-1]) ) &&
 379             (messagePosEn+1 == message.size() || !isSimpleChar(message[messagePosEn+1]) ) ) {
 380           // ^the complicated if statement is just 'whole-word' match
 381           Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
 382         }
 383       }
 384     } else {
 385       // go to nearest alphanumeric string (before current position),
 386       // see if that word can be found in original message. If so,
 387       // point to that, else keep pointer where it was.
 388       int nearestWordEn = caretPos - 1;
 389       while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
 390         --nearestWordEn;
 391       }
 392       if(isSimpleChar(slice[nearestWordEn])) {
 393         int nearestWordSt = nearestWordEn;
 394         while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
 395           --nearestWordSt;
 396         }
 397         string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
 398         Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
 399         unsigned messagePosSt = message.find(word);
 400         unsigned messagePosEn = messagePosSt + (nearestWordEn - nearestWordSt + 1);
 401         if( messagePosSt < string::npos &&
 402             (messagePosSt == 0 || !isSimpleChar(message[messagePosSt-1]) ) &&
 403             (messagePosEn+1 == message.size() || !isSimpleChar(message[messagePosEn+1]) ) ) {
 404           // ^the complicated if statement is just 'whole-word' match
 405           Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
 406                                   << nearestWordSt << std::endl;
 407           caretPos = nearestWordSt;
 408         } else {
 409           // this doesn't look good. caret generally getting printed
 410           // at unhelpful positions. improve upstream?
 411           return message;
 412         }
 413       }
 414     }
 415     caretPos += caretPosExtra;
 416   }// end of caret position computation/heuristics
 417
 418   ss << std::endl;
 419   while( caretPos-- > 0 ) {
 420     ss << ' ';
 421   }
 422   ss << '^' << endl;
 423   if(multilineMessage) {
 424      ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
 425   }
 426   return ss.str();
 427 }
 428
 429 void AntlrInput::parseError(const std::string& message, bool eofException)
 430   throw (ParserException) {
 431
 432   string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
 433                                            d_lexer->getCharPositionInLine(d_lexer),
 434                                            message);
 435
 436   Debug("parser") << "Throwing exception: "
 437       << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
 438       << d_lexer->getLine(d_lexer) << "."
 439       << d_lexer->getCharPositionInLine(d_lexer) << ": "
 440       << updatedMessage << endl;
 441   if(eofException) {
 442     throw ParserEndOfFileException(message,
 443                                    (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 444                                    d_lexer->getLine(d_lexer),
 445                                    d_lexer->getCharPositionInLine(d_lexer));
 446   } else {
 447     throw ParserException(updatedMessage,
 448                           (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 449                           d_lexer->getLine(d_lexer),
 450                           d_lexer->getCharPositionInLine(d_lexer));
 451   }
 452 }
 453
 454
 455 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
 456   d_lexer = pLexer;
 457
 458   pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
 459   if( pTokenFactory != NULL ) {
 460     pTokenFactory->close(pTokenFactory);
 461   }
 462
 463   /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
 464   pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
 465   if( pTokenFactory == NULL ) {
 466     throw InputStreamException("Couldn't create token factory.");
 467   }
 468   d_lexer->rec->state->tokFactory = pTokenFactory;
 469
 470   pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
 471   if( buffer == NULL ) {
 472     throw InputStreamException("Couldn't create token buffer.");
 473   }
 474
 475   d_tokenBuffer = buffer;
 476
 477   // Override default lexer error reporting
 478   d_lexer->rec->reportError = &lexerError;
 479   // Override default nextToken function, just to prevent exceptions escaping.
 480   d_lexer->rec->state->tokSource->nextToken = &nextToken;
 481 }
 482
 483 void AntlrInput::setParser(Parser& parser) {
 484   // ANTLR isn't using super in the lexer or the parser, AFAICT.
 485   // We could also use @lexer/parser::context to add a field to the generated
 486   // objects, but then it would have to be declared separately in every
 487   // language's grammar and we'd have to in the address of the field anyway.
 488   d_lexer->super = &parser;
 489   d_parser->super = &parser;
 490 }
 491
 492 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
 493   d_parser = pParser;
 494 //  d_parser->rec->match = &match;
 495   d_parser->rec->reportError = &reportError;
 496   /* Don't try to recover from a parse error. */
 497   // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
 498   d_parser->rec->recoverFromMismatchedToken =
 499       (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
 500       d_parser->rec->mismatch;
 501 }
 502
 503 }/* CVC4::parser namespace */
 504 }/* CVC4 namespace */