src/parser/antlr_input.cpp

   1 /*********************                                                        */
   2 /** antlr_input.cpp
   3  ** Original author: cconway
   4  ** Major contributors: none
   5  ** Minor contributors (to current version): none
   6  ** This file is part of the CVC4 prototype.
   7  ** Copyright (c) 2009, 2010  The Analysis of Computer Systems Group (ACSys)
   8  ** Courant Institute of Mathematical Sciences
   9  ** New York University
  10  ** See the file COPYING in the top-level source directory for licensing
  11  ** information.
  12  **
  13  ** A super-class for ANTLR-generated input language parsers
  14  **/
  15
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits.h>
#include <antlr3.h>

#include "antlr_input.h"
#include "bounded_token_buffer.h"
#include "bounded_token_factory.h"
#include "memory_mapped_input_buffer.h"
#include "parser_exception.h"
#include "parser_state.h"

#include "util/output.h"
#include "util/Assert.h"
#include "expr/command.h"
#include "expr/type.h"
  31
  32 using namespace std;
  33 using namespace CVC4;
  34 using namespace CVC4::parser;
  35 using namespace CVC4::kind;
  36
  37 namespace CVC4 {
  38 namespace parser {
  39
  40 AntlrInput::AntlrInput(ExprManager* exprManager, const std::string& filename, unsigned int lookahead, bool useMmap) :
  41     Input(exprManager, filename),
  42     d_lookahead(lookahead),
  43     d_lexer(NULL),
  44     d_parser(NULL),
  45     d_tokenStream(NULL) {
  46
  47   if( useMmap ) {
  48     d_input = MemoryMappedInputBufferNew(filename);
  49   } else {
  50     d_input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) filename.c_str());
  51   }
  52   if( d_input == NULL ) {
  53     throw ParserException("Couldn't open file: " + filename);
  54   }
  55 }
  56
  57 /*
  58 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
  59   Parser(exprManager,name),
  60   d_lookahead(lookahead) {
  61
  62 }
  63 */
  64
  65 AntlrInput::AntlrInput(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
  66   Input(exprManager,name),
  67   d_lookahead(lookahead),
  68   d_lexer(NULL),
  69   d_parser(NULL),
  70   d_tokenStream(NULL) {
  71   char* inputStr = strdup(input.c_str());
  72   char* nameStr = strdup(name.c_str());
  73   if( inputStr==NULL || nameStr==NULL ) {
  74     throw ParserException("Couldn't initialize string input: '" + input + "'");
  75   }
  76   d_input = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
  77   if( d_input == NULL ) {
  78     throw ParserException("Couldn't create input stream for string: '" + input + "'");
  79   }
  80 }
  81
  82 AntlrInput::~AntlrInput() {
  83   d_tokenStream->free(d_tokenStream);
  84   d_input->close(d_input);
  85 }
  86
  87 pANTLR3_INPUT_STREAM AntlrInput::getInputStream() {
  88   return d_input;
  89 }
  90
  91 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
  92   return d_tokenStream;
  93 }
  94
  95
  96 /// Match current input symbol against ttype.  Upon error, do one token
  97 /// insertion or deletion if possible.
  98 /// To turn off single token insertion or deletion error
  99 /// recovery, override mismatchRecover() and have it call
 100 /// plain mismatch(), which does not recover.  Then any error
 101 /// in a rule will cause an exception and immediate exit from
 102 /// rule.  Rule would recover by resynchronizing to the set of
 103 /// symbols that can follow rule ref.
 104 ///
 105 // [chris 4/5/2010] Copy and paste from antlr3baserecognizer.c
 106 void *
 107 AntlrInput::match(pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype,
 108       pANTLR3_BITSET_LIST follow) {
 109   pANTLR3_PARSER parser;
 110   pANTLR3_TREE_PARSER tparser;
 111   pANTLR3_INT_STREAM is;
 112   void * matchedSymbol;
 113
 114   switch(recognizer->type) {
 115   case ANTLR3_TYPE_PARSER:
 116
 117     parser = (pANTLR3_PARSER)(recognizer->super);
 118     tparser = NULL;
 119     is = parser->tstream->istream;
 120
 121     break;
 122
 123   case ANTLR3_TYPE_TREE_PARSER:
 124
 125     tparser = (pANTLR3_TREE_PARSER)(recognizer->super);
 126     parser = NULL;
 127     is = tparser->ctnstream->tnstream->istream;
 128
 129     break;
 130
 131   default:
 132
 133     ANTLR3_FPRINTF(
 134                    stderr,
 135                    "Base recognizer function 'match' called by unknown parser type - provide override for this function\n");
 136     return ANTLR3_FALSE;
 137
 138     break;
 139   }
 140
 141   // Pick up the current input token/node for assignment to labels
 142   //
 143   matchedSymbol = recognizer->getCurrentInputSymbol(recognizer, is);
 144
 145   if(is->_LA(is, 1) == ttype) {
 146     // The token was the one we were told to expect
 147     //
 148     is->consume(is); // Consume that token from the stream
 149     recognizer->state->errorRecovery = ANTLR3_FALSE; // Not in error recovery now (if we were)
 150     recognizer->state->failed = ANTLR3_FALSE; // The match was a success
 151     return matchedSymbol; // We are done
 152   }
 153
 154   // We did not find the expected token type, if we are backtracking then
 155   // we just set the failed flag and return.
 156   //
 157   if(recognizer->state->backtracking > 0) {
 158     // Backtracking is going on
 159     //
 160     recognizer->state->failed = ANTLR3_TRUE;
 161     return matchedSymbol;
 162   }
 163
 164   // We did not find the expected token and there is no backtracking
 165   // going on, so we mismatch, which creates an exception in the recognizer exception
 166   // stack.
 167   //
 168   matchedSymbol = recognizer->recoverFromMismatchedToken(recognizer, ttype,
 169                                                          follow);
 170   return matchedSymbol;
 171 }
 172
 173 void AntlrInput::parseError(const std::string& message)
 174     throw (ParserException) {
 175   Debug("parser") << "Throwing exception: "
 176       << getParserState()->getFilename() << ":"
 177       << d_lexer->getLine(d_lexer) << "."
 178       << d_lexer->getCharPositionInLine(d_lexer) << ": "
 179       << message << endl;
 180   throw ParserException(message, getParserState()->getFilename(),
 181                         d_lexer->getLine(d_lexer),
 182                         d_lexer->getCharPositionInLine(d_lexer));
 183 }
 184
 185 void *
 186 AntlrInput::recoverFromMismatchedToken(pANTLR3_BASE_RECOGNIZER recognizer,
 187                                        ANTLR3_UINT32 ttype,
 188                                        pANTLR3_BITSET_LIST follow) {
 189
 190   pANTLR3_PARSER parser = (pANTLR3_PARSER) (recognizer->super);
 191   pANTLR3_INT_STREAM is = parser->tstream->istream;
 192   void *matchedSymbol;
 193
 194
 195   // Create an exception if we need one
 196   //
 197   if(recognizer->state->exception == NULL) {
 198     antlr3RecognitionExceptionNew(recognizer);
 199   }
 200
 201   if(recognizer->mismatchIsUnwantedToken(recognizer, is, ttype) == ANTLR3_TRUE) {
 202     recognizer->state->exception->type = ANTLR3_UNWANTED_TOKEN_EXCEPTION;
 203     recognizer->state->exception->message
 204         = (void*)ANTLR3_UNWANTED_TOKEN_EXCEPTION_NAME;
 205   }
 206
 207   if(recognizer->mismatchIsMissingToken(recognizer, is, follow)) {
 208     matchedSymbol = recognizer->getMissingSymbol(recognizer, is,
 209                                                  recognizer->state->exception,
 210                                                  ttype, follow);
 211     recognizer->state->exception->type = ANTLR3_MISSING_TOKEN_EXCEPTION;
 212     recognizer->state->exception->message = (void*)ANTLR3_MISSING_TOKEN_EXCEPTION_NAME;
 213     recognizer->state->exception->token = matchedSymbol;
 214     recognizer->state->exception->expecting = ttype;
 215   }
 216
 217   reportError(recognizer);
 218   Unreachable("reportError should have thrown exception in AntlrInput::recoverFromMismatchedToken");
 219 }
 220
 221 void AntlrInput::reportError(pANTLR3_BASE_RECOGNIZER recognizer) {
 222   pANTLR3_EXCEPTION ex = recognizer->state->exception;
 223   pANTLR3_UINT8 * tokenNames = recognizer->state->tokenNames;
 224   stringstream ss;
 225 //  std::string msg;
 226
 227   // Signal we are in error recovery now
 228   recognizer->state->errorRecovery = ANTLR3_TRUE;
 229
 230   // Indicate this recognizer had an error while processing.
 231   recognizer->state->errorCount++;
 232
 233   // Call the builtin error formatter
 234   // recognizer->displayRecognitionError(recognizer, recognizer->state->tokenNames);
 235
 236   /* This switch statement is adapted from antlr3baserecognizer.c:displayRecognitionError in libantlr3c.
 237    * TODO: Make error messages more useful, maybe by including more expected tokens and information
 238    * about the current token. */
 239   switch(ex->type) {
 240   case ANTLR3_UNWANTED_TOKEN_EXCEPTION:
 241
 242     // Indicates that the recognizer was fed a token which seems to be
 243     // spurious input. We can detect this when the token that follows
 244     // this unwanted token would normally be part of the syntactically
 245     // correct stream. Then we can see that the token we are looking at
 246     // is just something that should not be there and throw this exception.
 247     //
 248     if(tokenNames == NULL) {
 249       ss << "Unexpected token." ;
 250     } else {
 251       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 252         ss << "Expected end of file.";
 253       } else {
 254         ss << "Expected " << tokenNames[ex->expecting]
 255            << ", found '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 256       }
 257     }
 258     break;
 259
 260   case ANTLR3_MISSING_TOKEN_EXCEPTION:
 261
 262     // Indicates that the recognizer detected that the token we just
 263     // hit would be valid syntactically if preceded by a particular
 264     // token. Perhaps a missing ';' at line end or a missing ',' in an
 265     // expression list, and such like.
 266     //
 267     if(tokenNames == NULL) {
 268       ss << "Missing token (" << ex->expecting << ").";
 269     } else {
 270       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 271         ss << "Missing end of file marker.";
 272       } else {
 273         ss << "Missing " << tokenNames[ex->expecting] << ".";
 274       }
 275     }
 276     break;
 277
 278   case ANTLR3_RECOGNITION_EXCEPTION:
 279
 280     // Indicates that the recognizer received a token
 281     // in the input that was not predicted. This is the basic exception type
 282     // from which all others are derived. So we assume it was a syntax error.
 283     // You may get this if there are not more tokens and more are needed
 284     // to complete a parse for instance.
 285     //
 286     ss <<"Syntax error.";
 287     break;
 288
 289   case ANTLR3_MISMATCHED_TOKEN_EXCEPTION:
 290
 291     // We were expecting to see one thing and got another. This is the
 292     // most common error if we could not detect a missing or unwanted token.
 293     // Here you can spend your efforts to
 294     // derive more useful error messages based on the expected
 295     // token set and the last token and so on. The error following
 296     // bitmaps do a good job of reducing the set that we were looking
 297     // for down to something small. Knowing what you are parsing may be
 298     // able to allow you to be even more specific about an error.
 299     //
 300     if(tokenNames == NULL) {
 301       ss << "Syntax error.";
 302     } else {
 303       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 304         ss << "Expected end of file.";
 305       } else {
 306         ss << "Expected " << tokenNames[ex->expecting] << ".";
 307       }
 308     }
 309     break;
 310
 311   case ANTLR3_NO_VIABLE_ALT_EXCEPTION:
 312     // We could not pick any alt decision from the input given
 313     // so god knows what happened - however when you examine your grammar,
 314     // you should. It means that at the point where the current token occurred
 315     // that the DFA indicates nowhere to go from here.
 316     //
 317     ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 318     break;
 319
 320   case ANTLR3_MISMATCHED_SET_EXCEPTION:
 321
 322   {
 323     ANTLR3_UINT32 count;
 324     ANTLR3_UINT32 bit;
 325     ANTLR3_UINT32 size;
 326     ANTLR3_UINT32 numbits;
 327     pANTLR3_BITSET errBits;
 328
 329     // This means we were able to deal with one of a set of
 330     // possible tokens at this point, but we did not see any
 331     // member of that set.
 332     //
 333     ss << "Unexpected input: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token)
 334        << "'. Expected one of: ";
 335
 336     // What tokens could we have accepted at this point in the
 337     // parse?
 338     //
 339     count = 0;
 340     errBits = antlr3BitsetLoad(ex->expectingSet);
 341     numbits = errBits->numBits(errBits);
 342     size = errBits->size(errBits);
 343
 344     if(size > 0) {
 345       // However many tokens we could have dealt with here, it is usually
 346       // not useful to print ALL of the set here. I arbitrarily chose 8
 347       // here, but you should do whatever makes sense for you of course.
 348       // No token number 0, so look for bit 1 and on.
 349       //
 350       for(bit = 1; bit < numbits && count < 8 && count < size; bit++) {
 351         // TODO: This doesn;t look right - should be asking if the bit is set!!
 352         //
 353         if(tokenNames[bit]) {
 354           if( count++ > 0 ) {
 355             ss << ", ";
 356           }
 357           ss << tokenNames[bit];
 358         }
 359       }
 360     } else {
 361       Unreachable("Parse error with empty set of expected tokens.");
 362     }
 363   }
 364     break;
 365
 366   case ANTLR3_EARLY_EXIT_EXCEPTION:
 367
 368     // We entered a loop requiring a number of token sequences
 369     // but found a token that ended that sequence earlier than
 370     // we should have done.
 371     //
 372     ss << "Sequence terminated early by token: '"
 373        << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 374     break;
 375
 376   default:
 377
 378     // We don't handle any other exceptions here, but you can
 379     // if you wish. If we get an exception that hits this point
 380     // then we are just going to report what we know about the
 381     // token.
 382     //
 383     Unhandled("Unexpected exception in parser.");
 384     break;
 385   }
 386
 387   // Now get ready to throw an exception
 388   pANTLR3_PARSER parser = (pANTLR3_PARSER)(recognizer->super);
 389   AlwaysAssert(parser!=NULL);
 390   ParserState *parserState = (ParserState*)(parser->super);
 391   AlwaysAssert(parserState!=NULL);
 392
 393   // Call the error display routine
 394   parserState->parseError(ss.str());
 395 }
 396
 397 void AntlrInput::setLexer(pANTLR3_LEXER pLexer) {
 398   d_lexer = pLexer;
 399
 400   pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
 401   if( pTokenFactory != NULL ) {
 402     pTokenFactory->close(pTokenFactory);
 403   }
 404
 405   /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
 406   pTokenFactory = BoundedTokenFactoryNew(d_input, 2*d_lookahead);
 407   if( pTokenFactory == NULL ) {
 408     throw ParserException("Couldn't create token factory.");
 409   }
 410   d_lexer->rec->state->tokFactory = pTokenFactory;
 411
 412   pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
 413   if( buffer == NULL ) {
 414     throw ParserException("Couldn't create token buffer.");
 415   }
 416
 417   d_tokenStream = buffer->commonTstream;
 418 }
 419
 420 void AntlrInput::setParser(pANTLR3_PARSER pParser) {
 421   d_parser = pParser;
 422   // ANTLR isn't using super, AFAICT.
 423   // We could also use @parser::context to add a field to the generated parser, but then
 424   // it would have to be declared separately in every input's grammar and we'd have to
 425   // pass it in as an address anyway.
 426   d_parser->super = getParserState();
 427   d_parser->rec->match = &match;
 428   d_parser->rec->reportError = &reportError;
 429   d_parser->rec->recoverFromMismatchedToken = &recoverFromMismatchedToken;
 430 }
 431
 432
 433 }/* CVC4::parser namespace */
 434 }/* CVC4 namespace */