src/parser/antlr_input_imports.cpp

   1 /*********************                                                        */
   2 /*! \file antlr_input_imports.cpp
   3  ** \verbatim
   4  ** Original author: cconway
   5  ** Major contributors: none
   6  ** Minor contributors (to current version): none
   7  ** This file is part of the CVC4 prototype.
   8  ** Copyright (c) 2009, 2010  The Analysis of Computer Systems Group (ACSys)
   9  ** Courant Institute of Mathematical Sciences
  10  ** New York University
  11  ** See the file COPYING in the top-level source directory for licensing
  12  ** information.\endverbatim
  13  **
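  14  ** \brief CVC4 adaptations of libantlr3c error-reporting and token-fetching routines
  15  **
  16  ** Defines AntlrInput::reportError() and AntlrInput::nextTokenStr(), both based on implementations in libantlr3c.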
  17  ** \todo document this file
  18  **/
  19
  20 /*
  21  * The functions in this file are based on implementations in libantlr3c,
  22  * with only minor CVC4-specific changes.
  23  */
  24
  25 // [The "BSD licence"]
  26 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
  27 // http://www.temporal-wave.com
  28 // http://www.linkedin.com/in/jimidle
  29 //
  30 // All rights reserved.
  31 //
  32 // Redistribution and use in source and binary forms, with or without
  33 // modification, are permitted provided that the following conditions
  34 // are met:
  35 // 1. Redistributions of source code must retain the above copyright
  36 //    notice, this list of conditions and the following disclaimer.
  37 // 2. Redistributions in binary form must reproduce the above copyright
  38 //    notice, this list of conditions and the following disclaimer in the
  39 //    documentation and/or other materials provided with the distribution.
  40 // 3. The name of the author may not be used to endorse or promote products
  41 //    derived from this software without specific prior written permission.
  42 //
  43 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  44 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  45 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  46 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  47 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  48 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  49 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  50 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  51 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  52 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  53
  54 #include <antlr3.h>
  55 #include <sstream>
  56
  57 #include "parser/antlr_input.h"
  58 #include "parser/parser.h"
  59 #include "parser/parser_exception.h"
  60 #include "util/Assert.h"
  61
  62 using namespace std;
  63
  64 namespace CVC4 {
  65 namespace parser {
  66
  67 /// Report a recognition problem.
  68 ///
  69 /// This method sets errorRecovery to indicate the parser is recovering
  70 /// not parsing.  Once in recovery mode, no errors are generated.
  71 /// To get out of recovery mode, the parser must successfully match
  72 /// a token (after a resync).  So it will go:
  73 ///
  74 ///             1. error occurs
  75 ///             2. enter recovery mode, report error
  76 ///             3. consume until token found in resynch set
  77 ///             4. try to resume parsing
  78 ///             5. next match() will reset errorRecovery mode
  79 ///
  80 /// If you override, make sure to update errorCount if you care about that.
  81 ///
  82 /* *** CVC4 NOTE ***
  83  * This function is has been modified in not-completely-trivial ways from its
  84  * libantlr3c implementation to support more informative error messages and to
  85  * invoke the error reporting mechanism of the Input class instead of the
  86  * default error printer.
  87  */
  88 void AntlrInput::reportError(pANTLR3_BASE_RECOGNIZER recognizer) {
  89   pANTLR3_EXCEPTION ex = recognizer->state->exception;
  90   pANTLR3_UINT8 * tokenNames = recognizer->state->tokenNames;
  91   stringstream ss;
  92
  93   // Dig the CVC4 objects out of the ANTLR3 mess
  94   pANTLR3_PARSER antlr3Parser = (pANTLR3_PARSER)(recognizer->super);
  95   AlwaysAssert(antlr3Parser!=NULL);
  96   Parser *parser = (Parser*)(antlr3Parser->super);
  97   AlwaysAssert(parser!=NULL);
  98   AntlrInput *input = (AntlrInput*) parser->getInput() ;
  99   AlwaysAssert(input!=NULL);
 100
 101   // Signal we are in error recovery now
 102   recognizer->state->errorRecovery = ANTLR3_TRUE;
 103
 104   // Indicate this recognizer had an error while processing.
 105   recognizer->state->errorCount++;
 106
 107   // Call the builtin error formatter
 108   // recognizer->displayRecognitionError(recognizer, recognizer->state->tokenNames);
 109
 110   /* TODO: Make error messages more useful, maybe by including more expected tokens and information
 111    * about the current token. */
 112   switch(ex->type) {
 113   case ANTLR3_UNWANTED_TOKEN_EXCEPTION:
 114
 115     // Indicates that the recognizer was fed a token which seems to be
 116     // spurious input. We can detect this when the token that follows
 117     // this unwanted token would normally be part of the syntactically
 118     // correct stream. Then we can see that the token we are looking at
 119     // is just something that should not be there and throw this exception.
 120     //
 121     if(tokenNames == NULL) {
 122       ss << "Unexpected token." ;
 123     } else {
 124       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 125         ss << "Expected end of file.";
 126       } else {
 127         ss << "Expected " << tokenNames[ex->expecting]
 128            << ", found '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 129       }
 130     }
 131     break;
 132
 133   case ANTLR3_MISSING_TOKEN_EXCEPTION:
 134
 135     // Indicates that the recognizer detected that the token we just
 136     // hit would be valid syntactically if preceded by a particular
 137     // token. Perhaps a missing ';' at line end or a missing ',' in an
 138     // expression list, and such like.
 139     //
 140     if(tokenNames == NULL) {
 141       ss << "Missing token (" << ex->expecting << ").";
 142     } else {
 143       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 144         ss << "Missing end of file marker.";
 145       } else if( ex->expecting == 0 ) {
 146         ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 147       } else {
 148         ss << "Missing " << tokenNames[ex->expecting] << ".";
 149       }
 150     }
 151     break;
 152
 153   case ANTLR3_RECOGNITION_EXCEPTION:
 154
 155     // Indicates that the recognizer received a token
 156     // in the input that was not predicted. This is the basic exception type
 157     // from which all others are derived. So we assume it was a syntax error.
 158     // You may get this if there are not more tokens and more are needed
 159     // to complete a parse for instance.
 160     //
 161     ss <<"Syntax error.";
 162     break;
 163
 164   case ANTLR3_MISMATCHED_TOKEN_EXCEPTION:
 165
 166     // We were expecting to see one thing and got another. This is the
 167     // most common error if we could not detect a missing or unwanted token.
 168     // Here you can spend your efforts to
 169     // derive more useful error messages based on the expected
 170     // token set and the last token and so on. The error following
 171     // bitmaps do a good job of reducing the set that we were looking
 172     // for down to something small. Knowing what you are parsing may be
 173     // able to allow you to be even more specific about an error.
 174     //
 175     if(tokenNames == NULL) {
 176       ss << "Syntax error.";
 177     } else {
 178       if(ex->expecting == ANTLR3_TOKEN_EOF) {
 179         ss << "Expected end of file.";
 180       } else if( ex->expecting == 0 ) {
 181         ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 182       } else {
 183         ss << "Expected " << tokenNames[ex->expecting] << ".";
 184       }
 185     }
 186     break;
 187
 188   case ANTLR3_NO_VIABLE_ALT_EXCEPTION:
 189     // We could not pick any alt decision from the input given
 190     // so god knows what happened - however when you examine your grammar,
 191     // you should. It means that at the point where the current token occurred
 192     // that the DFA indicates nowhere to go from here.
 193     //
 194     ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 195     break;
 196
 197   case ANTLR3_MISMATCHED_SET_EXCEPTION:
 198
 199   {
 200     ANTLR3_UINT32 count;
 201     ANTLR3_UINT32 bit;
 202     ANTLR3_UINT32 size;
 203     ANTLR3_UINT32 numbits;
 204     pANTLR3_BITSET errBits;
 205
 206     // This means we were able to deal with one of a set of
 207     // possible tokens at this point, but we did not see any
 208     // member of that set.
 209     //
 210     ss << "Unexpected input: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token)
 211        << "'. Expected one of: ";
 212
 213     // What tokens could we have accepted at this point in the
 214     // parse?
 215     //
 216     count = 0;
 217     errBits = antlr3BitsetLoad(ex->expectingSet);
 218     numbits = errBits->numBits(errBits);
 219     size = errBits->size(errBits);
 220
 221     if(size > 0) {
 222       // However many tokens we could have dealt with here, it is usually
 223       // not useful to print ALL of the set here. I arbitrarily chose 8
 224       // here, but you should do whatever makes sense for you of course.
 225       // No token number 0, so look for bit 1 and on.
 226       //
 227       for(bit = 1; bit < numbits && count < 8 && count < size; bit++) {
 228         // TODO: This doesn;t look right - should be asking if the bit is set!!
 229         //
 230         if(tokenNames[bit]) {
 231           if( count++ > 0 ) {
 232             ss << ", ";
 233           }
 234           ss << tokenNames[bit];
 235         }
 236       }
 237     } else {
 238       Unreachable("Parse error with empty set of expected tokens.");
 239     }
 240   }
 241     break;
 242
 243   case ANTLR3_EARLY_EXIT_EXCEPTION:
 244
 245     // We entered a loop requiring a number of token sequences
 246     // but found a token that ended that sequence earlier than
 247     // we should have done.
 248     //
 249     ss << "Sequence terminated early by token: '"
 250        << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
 251     break;
 252
 253   default:
 254
 255     // We don't handle any other exceptions here, but you can
 256     // if you wish. If we get an exception that hits this point
 257     // then we are just going to report what we know about the
 258     // token.
 259     //
 260     Unhandled("Unexpected exception in parser.");
 261     break;
 262   }
 263
 264   // Call the error display routine
 265   input->parseError(ss.str());
 266 }
 267
 268 ///
 269 /// \brief
 270 /// Returns the next available token from the current input stream.
 271 ///
 272 /// \param toksource
 273 /// Points to the implementation of a token source. The lexer is
 274 /// addressed by the super structure pointer.
 275 ///
 276 /// \returns
 277 /// The next token in the current input stream or the EOF token
 278 /// if there are no more tokens.
 279 ///
 280 /// \remarks
 281 /// Write remarks for nextToken here.
 282 ///
 283 /// \see nextToken
 284 ///
 285 /* *** CVC4 NOTE ***
 286  * This is copied, largely unmodified, from antlr3lexer.c
 287  *
 288  */
 289 pANTLR3_COMMON_TOKEN
 290 AntlrInput::nextTokenStr (pANTLR3_TOKEN_SOURCE toksource)
 291 {
 292   pANTLR3_LEXER lexer;
 293
 294   lexer = (pANTLR3_LEXER)(toksource->super);
 295
 296   /// Loop until we get a non skipped token or EOF
 297   ///
 298   for (;;)
 299   {
 300     // Get rid of any previous token (token factory takes care of
 301     // any de-allocation when this token is finally used up.
 302     //
 303     lexer->rec->state->token = NULL;
 304     lexer->rec->state->error = ANTLR3_FALSE; // Start out without an exception
 305     lexer->rec->state->failed = ANTLR3_FALSE;
 306
 307     // Now call the matching rules and see if we can generate a new token
 308     //
 309     for (;;)
 310     {
 311       // Record the start of the token in our input stream.
 312       //
 313       lexer->rec->state->channel = ANTLR3_TOKEN_DEFAULT_CHANNEL;
 314       lexer->rec->state->tokenStartCharIndex = lexer->input->istream->index(lexer->input->istream);
 315       lexer->rec->state->tokenStartCharPositionInLine = lexer->input->getCharPositionInLine(lexer->input);
 316       lexer->rec->state->tokenStartLine = lexer->input->getLine(lexer->input);
 317       lexer->rec->state->text = NULL;
 318
 319       if (lexer->input->istream->_LA(lexer->input->istream, 1) == ANTLR3_CHARSTREAM_EOF)
 320       {
 321         // Reached the end of the current stream, nothing more to do if this is
 322         // the last in the stack.
 323         //
 324         pANTLR3_COMMON_TOKEN teof = &(toksource->eofToken);
 325
 326         teof->setStartIndex (teof, lexer->getCharIndex(lexer));
 327         teof->setStopIndex (teof, lexer->getCharIndex(lexer));
 328         teof->setLine (teof, lexer->getLine(lexer));
 329         teof->factoryMade = ANTLR3_TRUE; // This isn't really manufactured but it stops things from trying to free it
 330         return teof;
 331       }
 332
 333       lexer->rec->state->token = NULL;
 334       lexer->rec->state->error = ANTLR3_FALSE; // Start out without an exception
 335       lexer->rec->state->failed = ANTLR3_FALSE;
 336
 337       // Call the generated lexer, see if it can get a new token together.
 338       //
 339       lexer->mTokens(lexer->ctx);
 340
 341       if (lexer->rec->state->error == ANTLR3_TRUE)
 342       {
 343         // Recognition exception, report it and try to recover.
 344         //
 345         lexer->rec->state->failed = ANTLR3_TRUE;
 346         // *** CVC4 EDIT: Just call the AntlrInput error routine
 347         lexerError(lexer->rec);
 348         lexer->recover(lexer);
 349       }
 350       else
 351       {
 352         if (lexer->rec->state->token == NULL)
 353         {
 354           // Emit the real token, which adds it in to the token stream basically
 355           //
 356           // *** CVC4 Edit: call emit on the lexer object
 357           lexer->emit(lexer);
 358         }
 359         else if (lexer->rec->state->token == &(toksource->skipToken))
 360         {
 361           // A real token could have been generated, but "Computer say's naaaaah" and it
 362           // it is just something we need to skip altogether.
 363           //
 364           continue;
 365         }
 366
 367         // Good token, not skipped, not EOF token
 368         //
 369         return lexer->rec->state->token;
 370       }
 371     }
 372   }
 373 }
 374
 375 } // namespace parser
 376 } // namespace CVC4