src/parser/antlr_input.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input.cpp
   3  ** \verbatim
   4  ** Original author: Christopher L. Conway
   5  ** Major contributors: Morgan Deters, Kshitij Bansal
   6  ** Minor contributors (to current version): Francois Bobot
   7  ** This file is part of the CVC4 project.
   8  ** Copyright (c) 2009-2014  New York University and The University of Iowa
   9  ** See the file COPYING in the top-level source directory for licensing
  10  ** information.\endverbatim
  11  **
  12  ** \brief A super-class for ANTLR-generated input language parsers.
  13  **
  14  ** A super-class for ANTLR-generated input language parsers
  15  **/
  16
  17 #include "parser/antlr_input.h"
  18 // We rely on the inclusion of #include <antlr3.h> in "parser/antlr_input.h".
  19 // This is avoid having to undefine the symbols in <antlr3.h>.
  20 // See the documentation in "parser/antlr_undefines.h" for more
  21 // details.
  22
  23 #include <limits.h>
  24 #include <stdint.h>
  25
  26 #include "base/output.h"
  27 #include "expr/type.h"
  28 #include "parser/antlr_line_buffered_input.h"
  29 #include "parser/bounded_token_buffer.h"
  30 #include "parser/bounded_token_factory.h"
  31 #include "parser/cvc/cvc_input.h"
  32 #include "parser/input.h"
  33 #include "parser/memory_mapped_input_buffer.h"
  34 #include "parser/parser.h"
  35 #include "parser/parser_exception.h"
  36 #include "parser/smt1/smt1_input.h"
  37 #include "parser/smt2/smt2_input.h"
  38 #include "parser/smt2/sygus_input.h"
  39 #include "parser/tptp/tptp_input.h"
  40 #include "smt_util/command.h"
  41
  42 using namespace std;
  43 using namespace CVC4;
  44 using namespace CVC4::parser;
  45 using namespace CVC4::kind;
  46
  47 namespace CVC4 {
  48 namespace parser {
  49
  50 // These functions exactly wrap the antlr3 source inconsistencies.
  51 // These are the only location CVC4_ANTLR3_OLD_INPUT_STREAM ifdefs appear.
  52 // No other sanity checking happens;
  53 pANTLR3_INPUT_STREAM newAntlr3BufferedStream(std::istream& input, const std::string& name);
  54 pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name);
  55 pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
  56                                             uint32_t size,
  57                                             const std::string& name);
  58
  59 pANTLR3_INPUT_STREAM
  60 newAntlr3BufferedStream(std::istream& input, const std::string& name){
  61   pANTLR3_INPUT_STREAM inputStream = NULL;
  62   pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
  63
  64 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  65   inputStream =
  66     antlr3LineBufferedStreamNew(input, 0, name_duplicate);
  67 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  68   inputStream =
  69     antlr3LineBufferedStreamNew(input, ANTLR3_ENC_8BIT, name_duplicate);
  70 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  71
  72   free(name_duplicate);
  73   return inputStream;
  74 }
  75
  76 pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name){
  77   pANTLR3_INPUT_STREAM input = NULL;
  78   pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
  79
  80   // libantlr3c v3.2 isn't source-compatible with v3.4
  81 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  82   input = antlr3AsciiFileStreamNew(name_duplicate);
  83 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  84   input = antlr3FileStreamNew(name_duplicate, ANTLR3_ENC_8BIT);
  85 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
  86
  87   free(name_duplicate);
  88   return input;
  89 }
  90
  91
  92 pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
  93                                             uint32_t size,
  94                                             const std::string& name){
  95   pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
  96   pANTLR3_INPUT_STREAM inputStream = NULL;
  97   /* Create an ANTLR input backed by the buffer. */
  98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
  99   inputStream =
 100     antlr3NewAsciiStringInPlaceStream(basep, size, name_duplicate);
 101 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 102   inputStream =
 103     antlr3StringStreamNew(basep, ANTLR3_ENC_8BIT, size,
 104                           name_duplicate);
 105 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
 106   free(name_duplicate);
 107   return inputStream;
 108 }
 109
 110 AntlrInputStream::AntlrInputStream(std::string name,
 111                                    pANTLR3_INPUT_STREAM input,
 112                                    bool fileIsTemporary,
 113                                    pANTLR3_UINT8 inputString) :
 114   InputStream(name, fileIsTemporary),
 115   d_input(input),
 116   d_inputString(inputString)
 117 {
 118   assert( input != NULL );
 119   input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
 120 }
 121
 122 AntlrInputStream::~AntlrInputStream() {
 123   d_input->free(d_input);
 124   if(d_inputString != NULL){
 125     free(d_inputString);
 126   }
 127 }
 128
 129 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
 130   return d_input;
 131 }
 132
 133
 134
 135 AntlrInputStream*
 136 AntlrInputStream::newFileInputStream(const std::string& name,
 137                                      bool useMmap)
 138   throw (InputStreamException) {
 139 #ifdef _WIN32
 140   if(useMmap) {
 141     useMmap = false;
 142   }
 143 #endif
 144   pANTLR3_INPUT_STREAM input = NULL;
 145   if(useMmap) {
 146     input = MemoryMappedInputBufferNew(name);
 147   } else {
 148     input = newAntlr3FileStream(name);
 149   }
 150   if(input == NULL) {
 151     throw InputStreamException("Couldn't open file: " + name);
 152   }
 153   return new AntlrInputStream( name, input, false, NULL );
 154 }
 155
 156
 157 AntlrInputStream*
 158 AntlrInputStream::newStreamInputStream(std::istream& input,
 159                                        const std::string& name,
 160                                        bool lineBuffered)
 161   throw (InputStreamException) {
 162
 163   pANTLR3_INPUT_STREAM inputStream = NULL;
 164   pANTLR3_UINT8 inputStringCopy = NULL;
 165
 166   if(lineBuffered) {
 167     inputStream = newAntlr3BufferedStream(input, name);
 168   } else {
 169
 170     // Since these are all NULL on entry, realloc will be called
 171     char *basep = NULL, *boundp = NULL, *cp = NULL;
 172     /* 64KB seems like a reasonable default size. */
 173     size_t bufSize = 0x10000;
 174
 175     /* Keep going until we can't go no more. */
 176     while( !input.eof() && !input.fail() ) {
 177
 178       if( cp == boundp ) {
 179         /* We ran out of room in the buffer. Realloc at double the size. */
 180         ptrdiff_t offset = cp - basep;
 181         basep = (char *) realloc(basep, bufSize);
 182         if( basep == NULL ) {
 183           throw InputStreamException("Failed buffering input stream: " + name);
 184         }
 185         cp = basep + offset;
 186         boundp = basep + bufSize;
 187         bufSize *= 2;
 188       }
 189
 190       /* Read as much as we have room for. */
 191       input.read( cp, boundp - cp );
 192       cp += input.gcount();
 193     }
 194
 195     /* Make sure the fail bit didn't get set. */
 196     if( !input.eof() ) {
 197       throw InputStreamException("Stream input failed: " + name);
 198     }
 199     ptrdiff_t offset = cp - basep;
 200     assert(offset >= 0);
 201     assert(offset <= std::numeric_limits<uint32_t>::max());
 202     inputStringCopy = (pANTLR3_UINT8)basep;
 203     inputStream = newAntrl3InPlaceStream(inputStringCopy, (uint32_t) offset, name);
 204   }
 205
 206   if( inputStream == NULL ) {
 207     throw InputStreamException("Couldn't initialize input: " + name);
 208   }
 209
 210   return new AntlrInputStream( name, inputStream, false, inputStringCopy );
 211 }
 212
 213
 214 AntlrInputStream*
 215 AntlrInputStream::newStringInputStream(const std::string& input,
 216                                        const std::string& name)
 217   throw (InputStreamException) {
 218
 219   size_t input_size = input.size();
 220   assert(input_size <= std::numeric_limits<uint32_t>::max());
 221
 222   // Ownership of input_duplicate  is transferred to the AntlrInputStream.
 223   pANTLR3_UINT8 input_duplicate = (pANTLR3_UINT8) strdup(input.c_str());
 224
 225   if( input_duplicate == NULL ) {
 226     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 227   }
 228
 229   pANTLR3_INPUT_STREAM inputStream = newAntrl3InPlaceStream(input_duplicate, (uint32_t)input_size, name);
 230   if( inputStream==NULL ) {
 231     throw InputStreamException("Couldn't initialize string input: '" + input + "'");
 232   }
 233   return new AntlrInputStream( name, inputStream, false, input_duplicate );
 234 }
 235
 236 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
 237   using namespace language::input;
 238
 239   AntlrInput* input;
 240
 241   switch(lang) {
 242   case LANG_CVC4: {
 243     input = new CvcInput(inputStream);
 244     break;
 245   }
 246   case LANG_SMTLIB_V1:
 247     input = new Smt1Input(inputStream);
 248     break;
 249
 250   case LANG_SMTLIB_V2_0:
 251   case LANG_SMTLIB_V2_5:
 252     input = new Smt2Input(inputStream, lang);
 253     break;
 254
 255   case LANG_SYGUS:
 256     input = new SygusInput(inputStream);
 257     break;
 258
 259   case LANG_TPTP:
 260     input = new TptpInput(inputStream);
 261     break;
 262
 263   default:
 264     std::stringstream ss;
 265     ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
 266     throw InputStreamException(ss.str());
 267   }
 268
 269   return input;
 270 }
 271
 272 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
 273     Input(inputStream),
 274     d_lookahead(lookahead),
 275     d_lexer(NULL),
 276     d_parser(NULL),
 277     d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
 278     d_tokenBuffer(NULL) {
 279 }
 280
 281 /*
 282 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
 283   Parser(exprManager,name),
 284   d_lookahead(lookahead) {
 285
 286 }
 287 */
 288
 289 /*
 290 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
 291   Input(exprManager,name),
 292   d_lookahead(lookahead),
 293   d_lexer(NULL),
 294   d_parser(NULL),
 295   d_tokenStream(NULL) {
 296
 297   char* inputStr = strdup(input.c_str());
 298   char* nameStr = strdup(name.c_str());
 299   if( inputStr==NULL || nameStr==NULL ) {
 300     throw ParserException("Couldn't initialize string input: '" + input + "'");
 301   }
 302   d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
 303   if( d_inputStream == NULL ) {
 304     throw ParserException("Couldn't create input stream for string: '" + input + "'");
 305   }
 306
 307 }
 308 */
 309
 310 AntlrInput::~AntlrInput() {
 311   BoundedTokenBufferFree(d_tokenBuffer);
 312 }
 313
 314 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
 315   return d_tokenBuffer->commonTstream;
 316 }
 317
 318 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
 319   pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
 320   assert(lexer!=NULL);
 321   Parser *parser = (Parser*)(lexer->super);
 322   assert(parser!=NULL);
 323   AntlrInput *input = (AntlrInput*) parser->getInput();
 324   assert(input!=NULL);
 325
 326   /* Call the error display routine *if* there's not already a
 327    * parse error pending.  If a parser error is pending, this
 328    * error is probably less important, so we just drop it. */
 329   if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
 330     input->parseError("Error finding next token.");
 331   }
 332 }
 333
 334 void AntlrInput::warning(const std::string& message) {
 335   Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
 336 }
 337
 338
 339
 340 /**
 341  * characters considered part of a simple symbol in SMTLIB.
 342  *
 343  * TODO: Ideally this code shouldn't be smtlib specific (should work
 344  * with CVC language too), but this per-language specialization has
 345  * been left for a later point.
 346  */
 347 inline bool isSimpleChar(char ch) {
 348   return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
 349 }
 350
 351 size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
 352   size_t st = 0;
 353   size_t N = input.size();
 354   while(st < N) {
 355     while( st < N && (*isWordChar)(input[st])  == false ) st++;
 356     size_t en = st;
 357     while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
 358     if(en - st + 1 == pattern.size()) {
 359       bool match = true;
 360       for(size_t i = 0; match && i < pattern.size(); ++i) {
 361         match &= (pattern[i] == input[st+i]);
 362       }
 363       if(match == true) {
 364         return st;
 365       }
 366     }
 367     st = en + 1;
 368   }
 369   return string::npos;
 370 }
 371
 372 /**
 373  * Gets part of original input and tries to visually hint where the
 374  * error might be.
 375  *
 376  * Something like:
 377  *
 378  *   ...nd (= alpha beta) (= beta delta))
 379  *                                ^
 380  *
 381  * Implementation (as on 2014/04/24):
 382  *
 383  * > if suggested pointer by lexer is under a "simple char", move to
 384  *   start of the word and print pointer there.
 385  *
 386  * > in the other case, it tries to find the nearest word in the error
 387  *   message passed along. if it can't find it, we don't add this
 388  *   visual hint, as experimentally position suggested by lexer was
 389  *   found to be totally unhelpful. (TODO: fix this upstream to
 390  *   improve)
 391  */
 392 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
 393 {
 394   // Is it a multi-line message
 395   bool multilineMessage = (message.find('\n') != string::npos);
 396   // Useful only if it is a multi-line message
 397   int firstLineEnd = message.find('\n');
 398
 399   std::ostringstream ss, slicess;
 400
 401   // Keep first line of message
 402   if(multilineMessage) {
 403     ss << message.substr(0, firstLineEnd) << endl << endl;
 404   } else {
 405     ss << message << endl << endl;
 406   }
 407
 408   int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
 409   int posSliceEnd = posSliceStart + 70;
 410   int caretPos = 0;
 411   int caretPosExtra = 0; // for inital intendation, epilipses etc.
 412
 413   ss << "  "; caretPosExtra += 2;
 414   if(posSliceStart > 0) {
 415     ss << "..."; caretPosExtra += 3;
 416   }
 417
 418   for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
 419     if(i == posSliceEnd) {
 420       ss << "...";
 421       break;
 422     }
 423     if(i < charPositionInLine) { caretPos++; }
 424
 425     if(!isprint(lineStart[i])) {
 426       // non-printable character, something wrong, bail out
 427       return message;
 428     }
 429
 430     ss << (lineStart[i]);
 431     slicess << (lineStart[i]);
 432   }
 433
 434   // adjust position of caret, based on slice and message
 435   {
 436     int caretPosOrig = caretPos;
 437     string slice = slicess.str();
 438     if(isSimpleChar(slice[caretPos])) {
 439       // if alphanumeric, try to go to beginning of word/number
 440       while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
 441       if(caretPos == 0 && posSliceStart > 0) {
 442         // reached start and this is not really the start? bail out
 443         return message;
 444       } else {
 445         // likely it is also in original message? if so, very likely
 446         // we found the right place
 447         string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
 448         size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 449         Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 450         if( matchLoc != string::npos ) {
 451           Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
 452         }
 453       }
 454     } else {
 455       bool foundCaretPos = false;
 456
 457       for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
 458         // go to nearest alphanumeric string (before current position),
 459         // see if that word can be found in original message. If so,
 460         // point to that, else keep pointer where it was.
 461         int nearestWordEn = caretPos - 1;
 462         while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
 463           --nearestWordEn;
 464         }
 465         if(isSimpleChar(slice[nearestWordEn])) {
 466           int nearestWordSt = nearestWordEn;
 467           while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
 468             --nearestWordSt;
 469           }
 470           string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
 471           size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
 472           Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
 473           Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
 474           if( matchLoc != string::npos ) {
 475             Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
 476                                     << nearestWordSt << std::endl;
 477             foundCaretPos = true;
 478           }
 479           caretPos = nearestWordSt;
 480         }
 481       }
 482       if( !foundCaretPos) {
 483         // this doesn't look good. caret generally getting printed
 484         // at unhelpful positions. improve upstream?
 485         return message;
 486       }
 487     }
 488     caretPos += caretPosExtra;
 489   }// end of caret position computation/heuristics
 490
 491   ss << std::endl;
 492   while( caretPos-- > 0 ) {
 493     ss << ' ';
 494   }
 495   ss << '^' << endl;
 496   if(multilineMessage) {
 497      ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
 498   }
 499   return ss.str();
 500 }
 501
 502 void AntlrInput::parseError(const std::string& message, bool eofException)
 503   throw (ParserException) {
 504
 505   string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
 506                                            d_lexer->getCharPositionInLine(d_lexer),
 507                                            message);
 508
 509   Debug("parser") << "Throwing exception: "
 510       << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
 511       << d_lexer->getLine(d_lexer) << "."
 512       << d_lexer->getCharPositionInLine(d_lexer) << ": "
 513       << updatedMessage << endl;
 514   if(eofException) {
 515     throw ParserEndOfFileException(message,
 516                                    (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 517                                    d_lexer->getLine(d_lexer),
 518                                    d_lexer->getCharPositionInLine(d_lexer));
 519   } else {
 520     throw ParserException(updatedMessage,
 521                           (const char*)d_lexer->rec->state->tokSource->fileName->chars,
 522                           d_lexer->getLine(d_lexer),
 523                           d_lexer->getCharPositionInLine(d_lexer));
 524   }
 525 }
 526
 527
 528 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
 529   d_lexer = pLexer;
 530
 531   pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
 532   if( pTokenFactory != NULL ) {
 533     pTokenFactory->close(pTokenFactory);
 534   }
 535
 536   /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
 537   pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
 538   if( pTokenFactory == NULL ) {
 539     throw InputStreamException("Couldn't create token factory.");
 540   }
 541   d_lexer->rec->state->tokFactory = pTokenFactory;
 542
 543   pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
 544   if( buffer == NULL ) {
 545     throw InputStreamException("Couldn't create token buffer.");
 546   }
 547
 548   d_tokenBuffer = buffer;
 549
 550   // Override default lexer error reporting
 551   d_lexer->rec->reportError = &lexerError;
 552   // Override default nextToken function, just to prevent exceptions escaping.
 553   d_lexer->rec->state->tokSource->nextToken = &nextToken;
 554 }
 555
 556 void AntlrInput::setParser(Parser& parser) {
 557   // ANTLR isn't using super in the lexer or the parser, AFAICT.
 558   // We could also use @lexer/parser::context to add a field to the generated
 559   // objects, but then it would have to be declared separately in every
 560   // language's grammar and we'd have to in the address of the field anyway.
 561   d_lexer->super = &parser;
 562   d_parser->super = &parser;
 563 }
 564
 565 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
 566   d_parser = pParser;
 567 //  d_parser->rec->match = &match;
 568   d_parser->rec->reportError = &reportError;
 569   /* Don't try to recover from a parse error. */
 570   // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
 571   d_parser->rec->recoverFromMismatchedToken =
 572       (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
 573       d_parser->rec->mismatch;
 574 }
 575
 576 }/* CVC4::parser namespace */
 577 }/* CVC4 namespace */