src/parser/antlr_input_imports.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input_imports.cpp
   3  ** \verbatim
   4  ** Original author: cconway
   5  ** Major contributors: none
   6  ** Minor contributors (to current version): none
   7  ** This file is part of the CVC4 prototype.
   8  ** Copyright (c) 2009, 2010  The Analysis of Computer Systems Group (ACSys)
   9  ** Courant Institute of Mathematical Sciences
  10  ** New York University
  11  ** See the file COPYING in the top-level source directory for licensing
  12  ** information.\endverbatim
  13  **
  14  ** \brief [[ Add one-line brief description here ]]
  15  **
  16  ** [[ Add lengthier description here ]]
  17  ** \todo document this file
  18  **/
  19
  20 /*
  21  * The functions in this file are based on implementations in libantlr3c,
  22  * with only minor CVC4-specific changes.
  23  */
  24
  25 // [The "BSD licence"]
  26 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
  27 // http://www.temporal-wave.com
  28 // http://www.linkedin.com/in/jimidle
  29 //
  30 // All rights reserved.
  31 //
  32 // Redistribution and use in source and binary forms, with or without
  33 // modification, are permitted provided that the following conditions
  34 // are met:
  35 // 1. Redistributions of source code must retain the above copyright
  36 //    notice, this list of conditions and the following disclaimer.
  37 // 2. Redistributions in binary form must reproduce the above copyright
  38 //    notice, this list of conditions and the following disclaimer in the
  39 //    documentation and/or other materials provided with the distribution.
  40 // 3. The name of the author may not be used to endorse or promote products
  41 //    derived from this software without specific prior written permission.
  42 //
  43 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  44 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  45 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  46 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  47 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  48 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  49 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  50 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  51 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  52 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  53
  54
  55
  56 // These headers must be the first two included.
  57 // See the documentation in "parser/antlr_undefines.h" for more details.
  58 #include <antlr3.h>
  59 #include "parser/antlr_undefines.h"
  60
  61 #include <sstream>
  62
  63 #include "parser/antlr_input.h"
  64 #include "parser/parser.h"
  65 #include "parser/parser_exception.h"
  66
  67 using namespace std;
  68
  69 namespace CVC4 {
  70 namespace parser {
  71
  72 /// Report a recognition problem.
  73 ///
  74 /// This method sets errorRecovery to indicate the parser is recovering
  75 /// not parsing.  Once in recovery mode, no errors are generated.
  76 /// To get out of recovery mode, the parser must successfully match
  77 /// a token (after a resync).  So it will go:
  78 ///
  79 ///             1. error occurs
  80 ///             2. enter recovery mode, report error
  81 ///             3. consume until token found in resynch set
  82 ///             4. try to resume parsing
  83 ///             5. next match() will reset errorRecovery mode
  84 ///
  85 /// If you override, make sure to update errorCount if you care about that.
  86 ///
  87 /* *** CVC4 NOTE ***
  88  * This function is has been modified in not-completely-trivial ways from its
  89  * libantlr3c implementation to support more informative error messages and to
  90  * invoke the error reporting mechanism of the Input class instead of the
  91  * default error printer.
  92  */
  93 void AntlrInput::reportError(pANTLR3_BASE_RECOGNIZER recognizer) {
  94   pANTLR3_EXCEPTION ex = recognizer->state->exception;
  95   pANTLR3_UINT8 * tokenNames = recognizer->state->tokenNames;
  96   stringstream ss;
  97
  98   // Dig the CVC4 objects out of the ANTLR3 mess
  99   pANTLR3_PARSER antlr3Parser = (pANTLR3_PARSER)(recognizer->super);
 100   assert(antlr3Parser!=NULL);
 101   Parser *parser = (Parser*)(antlr3Parser->super);
 102   assert(parser!=NULL);
 103   AntlrInput *input = (AntlrInput*) parser->getInput() ;
 104   assert(input!=NULL);
 105
 106   // Signal we are in error recovery now
 107   recognizer->state->errorRecovery = ANTLR3_TRUE;
 108
 109   // Indicate this recognizer had an error while processing.
 110   recognizer->state->errorCount++;
 111
 112   // Call the builtin error formatter
 113   // recognizer->displayRecognitionError(recognizer, recognizer->state->tokenNames);
 114
 115   /* TODO: Make error messages more useful, maybe by including more expected tokens and information
 116    * about the current token. */
 117   switch(ex->type) {
 118   case ANTLR3_UNWANTED_TOKEN_EXCEPTION:
 119
 120     // Indicates that the recognizer was fed a token which seems to be
 121     // spurious input. We can detect this when the token that follows
 122     // this unwanted token would normally be part of the syntactically
 123     // correct stream. Then we can see that the token we are looking at
 124     // is just something that should not be there and throw this exception.
 125     //
 126     if(tokenNames == NULL) {
 127       ss << "Unexpected token." ;
 128     } else {
 129       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 130         ss << "Expected end of file.";
 131       } else {
 132         ss << "Expected " << tokenNames[ex->expecting]
 133            << ", found '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 134       }
 135     }
 136     break;
 137
 138   case ANTLR3_MISSING_TOKEN_EXCEPTION:
 139
 140     // Indicates that the recognizer detected that the token we just
 141     // hit would be valid syntactically if preceded by a particular
 142     // token. Perhaps a missing ';' at line end or a missing ',' in an
 143     // expression list, and such like.
 144     //
 145     if(tokenNames == NULL) {
 146       ss << "Missing token (" << ex->expecting << ").";
 147     } else {
 148       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 149         ss << "Missing end of file marker.";
 150       } else if( ex->expecting == 0 ) {
 151         ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 152         if( std::string(tokenText((pANTLR3_COMMON_TOKEN)ex->token)) == std::string("IN") ) {
 153           ss << " Did you mean: `IS_IN'?";
 154         }
 155       } else {
 156         ss << "Missing " << tokenNames[ex->expecting] << ".";
 157       }
 158     }
 159     break;
 160
 161   case ANTLR3_RECOGNITION_EXCEPTION:
 162
 163     // Indicates that the recognizer received a token
 164     // in the input that was not predicted. This is the basic exception type
 165     // from which all others are derived. So we assume it was a syntax error.
 166     // You may get this if there are not more tokens and more are needed
 167     // to complete a parse for instance.
 168     //
 169     ss <<"Syntax error.";
 170     break;
 171
 172   case ANTLR3_MISMATCHED_TOKEN_EXCEPTION:
 173
 174     // We were expecting to see one thing and got another. This is the
 175     // most common error if we could not detect a missing or unwanted token.
 176     // Here you can spend your efforts to
 177     // derive more useful error messages based on the expected
 178     // token set and the last token and so on. The error following
 179     // bitmaps do a good job of reducing the set that we were looking
 180     // for down to something small. Knowing what you are parsing may be
 181     // able to allow you to be even more specific about an error.
 182     //
 183     if(tokenNames == NULL) {
 184       ss << "Syntax error.";
 185     } else {
 186       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 187         ss << "Expected end of file.";
 188       } else if( ex->expecting == 0 ) {
 189         ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 190       } else {
 191         ss << "Expected " << tokenNames[ex->expecting] << ".";
 192       }
 193     }
 194     break;
 195
 196   case ANTLR3_NO_VIABLE_ALT_EXCEPTION:
 197     // We could not pick any alt decision from the input given
 198     // so god knows what happened - however when you examine your grammar,
 199     // you should. It means that at the point where the current token occurred
 200     // that the DFA indicates nowhere to go from here.
 201     //
 202     ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 203     break;
 204
 205   case ANTLR3_MISMATCHED_SET_EXCEPTION:
 206
 207   {
 208     ANTLR3_UINT32 count;
 209     ANTLR3_UINT32 bit;
 210     ANTLR3_UINT32 size;
 211     ANTLR3_UINT32 numbits;
 212     pANTLR3_BITSET errBits;
 213
 214     // This means we were able to deal with one of a set of
 215     // possible tokens at this point, but we did not see any
 216     // member of that set.
 217     //
 218     ss << "Unexpected input: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token)
 219        << "'. Expected one of: ";
 220
 221     // What tokens could we have accepted at this point in the
 222     // parse?
 223     //
 224     count = 0;
 225     errBits = antlr3BitsetLoad(ex->expectingSet);
 226     numbits = errBits->numBits(errBits);
 227     size = errBits->size(errBits);
 228
 229     if(size > 0) {
 230       // However many tokens we could have dealt with here, it is usually
 231       // not useful to print ALL of the set here. I arbitrarily chose 8
 232       // here, but you should do whatever makes sense for you of course.
 233       // No token number 0, so look for bit 1 and on.
 234       //
 235       for(bit = 1; bit < numbits && count < 8 && count < size; bit++) {
 236         // TODO: This doesn;t look right - should be asking if the bit is set!!
 237         //
 238         if(tokenNames[bit]) {
 239           if( count++ > 0 ) {
 240             ss << ", ";
 241           }
 242           ss << tokenNames[bit];
 243         }
 244       }
 245     } else {
 246       assert(false);//("Parse error with empty set of expected tokens.");
 247     }
 248   }
 249     break;
 250
 251   case ANTLR3_EARLY_EXIT_EXCEPTION:
 252
 253     // We entered a loop requiring a number of token sequences
 254     // but found a token that ended that sequence earlier than
 255     // we should have done.
 256     //
 257     ss << "Sequence terminated early by token: '"
 258        << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 259     break;
 260
 261   default:
 262
 263     // We don't handle any other exceptions here, but you can
 264     // if you wish. If we get an exception that hits this point
 265     // then we are just going to report what we know about the
 266     // token.
 267     //
 268     assert(false);//("Unexpected exception in parser.");
 269     break;
 270   }
 271
 272   // Call the error display routine
 273   input->parseError(ss.str(), ((pANTLR3_COMMON_TOKEN)ex->token)->type == ANTLR3_TOKEN_EOF);
 274 }
 275
 276 ///
 277 /// \brief
 278 /// Returns the next available token from the current input stream.
 279 ///
 280 /// \param toksource
 281 /// Points to the implementation of a token source. The lexer is
 282 /// addressed by the super structure pointer.
 283 ///
 284 /// \returns
 285 /// The next token in the current input stream or the EOF token
 286 /// if there are no more tokens.
 287 ///
 288 /// \remarks
 289 /// Write remarks for nextToken here.
 290 ///
 291 /// \see nextToken
 292 ///
 293 /* *** CVC4 NOTE ***
 294  * This is copied, largely unmodified, from antlr3lexer.c
 295  *
 296  */
 297 pANTLR3_COMMON_TOKEN
 298 AntlrInput::nextTokenStr (pANTLR3_TOKEN_SOURCE toksource)
 299 {
 300   pANTLR3_LEXER lexer;
 301
 302   lexer = (pANTLR3_LEXER)(toksource->super);
 303
 304   /// Loop until we get a non skipped token or EOF
 305   ///
 306   for (;;)
 307   {
 308     // Get rid of any previous token (token factory takes care of
 309     // any de-allocation when this token is finally used up.
 310     //
 311     lexer->rec->state->token = NULL;
 312     lexer->rec->state->error = ANTLR3_FALSE; // Start out without an exception
 313     lexer->rec->state->failed = ANTLR3_FALSE;
 314
 315     // Now call the matching rules and see if we can generate a new token
 316     //
 317     for (;;)
 318     {
 319       // Record the start of the token in our input stream.
 320       //
 321       lexer->rec->state->channel = ANTLR3_TOKEN_DEFAULT_CHANNEL;
 322       lexer->rec->state->tokenStartCharIndex = lexer->input->istream->index(lexer->input->istream);
 323       lexer->rec->state->tokenStartCharPositionInLine = lexer->input->getCharPositionInLine(lexer->input);
 324       lexer->rec->state->tokenStartLine = lexer->input->getLine(lexer->input);
 325       lexer->rec->state->text = NULL;
 326
 327       if (lexer->input->istream->_LA(lexer->input->istream, 1) == ANTLR3_CHARSTREAM_EOF)
 328       {
 329         // Reached the end of the current stream, nothing more to do if this is
 330         // the last in the stack.
 331         //
 332         pANTLR3_COMMON_TOKEN teof = &(toksource->eofToken);
 333
 334         teof->setStartIndex (teof, lexer->getCharIndex(lexer));
 335         teof->setStopIndex (teof, lexer->getCharIndex(lexer));
 336         teof->setLine (teof, lexer->getLine(lexer));
 337         teof->factoryMade = ANTLR3_TRUE; // This isn't really manufactured but it stops things from trying to free it
 338         return teof;
 339       }
 340
 341       lexer->rec->state->token = NULL;
 342       lexer->rec->state->error = ANTLR3_FALSE; // Start out without an exception
 343       lexer->rec->state->failed = ANTLR3_FALSE;
 344
 345       // Call the generated lexer, see if it can get a new token together.
 346       //
 347       lexer->mTokens(lexer->ctx);
 348
 349       if (lexer->rec->state->error == ANTLR3_TRUE)
 350       {
 351         // Recognition exception, report it and try to recover.
 352         //
 353         lexer->rec->state->failed = ANTLR3_TRUE;
 354         // *** CVC4 EDIT: Just call the AntlrInput error routine
 355         lexerError(lexer->rec);
 356         lexer->recover(lexer);
 357       }
 358       else
 359       {
 360         if (lexer->rec->state->token == NULL)
 361         {
 362           // Emit the real token, which adds it in to the token stream basically
 363           //
 364           // *** CVC4 Edit: call emit on the lexer object
 365           lexer->emit(lexer);
 366         }
 367         else if (lexer->rec->state->token == &(toksource->skipToken))
 368         {
 369           // A real token could have been generated, but "Computer say's naaaaah" and it
 370           // it is just something we need to skip altogether.
 371           //
 372           continue;
 373         }
 374
 375         // Good token, not skipped, not EOF token
 376         //
 377         return lexer->rec->state->token;
 378       }
 379     }
 380   }
 381 }
 382
 383 /* *** CVC4 NOTE ***
 384  * This is copied, totaly unmodified, from antlr3lexer.c
 385  * in order to use nextTokenStr previously defined.
 386  *
 387  */
 388 pANTLR3_COMMON_TOKEN
 389 AntlrInput::nextToken       (pANTLR3_TOKEN_SOURCE toksource)
 390 {
 391         pANTLR3_COMMON_TOKEN tok;
 392
 393         // Find the next token in the current stream
 394         //
 395         tok = nextTokenStr(toksource);
 396
 397         // If we got to the EOF token then switch to the previous
 398         // input stream if there were any and just return the
 399         // EOF if there are none. We must check the next token
 400         // in any outstanding input stream we pop into the active
 401         // role to see if it was sitting at EOF after PUSHing the
 402         // stream we just consumed, otherwise we will return EOF
 403         // on the reinstalled input stream, when in actual fact
 404         // there might be more input streams to POP before the
 405         // real EOF of the whole logical inptu stream. Hence we
 406         // use a while loop here until we find somethign in the stream
 407         // that isn't EOF or we reach the actual end of the last input
 408         // stream on the stack.
 409         //
 410         while   (tok->type == ANTLR3_TOKEN_EOF)
 411         {
 412                 pANTLR3_LEXER   lexer;
 413
 414                 lexer   = (pANTLR3_LEXER)(toksource->super);
 415
 416                 if  (lexer->rec->state->streams != NULL && lexer->rec->state->streams->size(lexer->rec->state->streams) > 0)
 417                 {
 418                         // We have another input stream in the stack so we
 419                         // need to revert to it, then resume the loop to check
 420                         // it wasn't sitting at EOF itself.
 421                         //
 422                         lexer->popCharStream(lexer);
 423                         tok = nextTokenStr(toksource);
 424                 }
 425                 else
 426                 {
 427                         // There were no more streams on the input stack
 428                         // so this EOF is the 'real' logical EOF for
 429                         // the input stream. So we just exit the loop and
 430                         // return the EOF we have found.
 431                         //
 432                         break;
 433                 }
 434
 435         }
 436
 437         // return whatever token we have, which may be EOF
 438         //
 439         return  tok;
 440 }
 441
 442
 443
 444 } // namespace parser
 445 } // namespace CVC4