Adding a match override to AntlrInput, in an attempt to work around Bug #76
[cvc5.git] / src / parser / antlr_input.cpp
1 /********************* */
2 /** antlr_input.cpp
3 ** Original author: cconway
4 ** Major contributors: none
5 ** Minor contributors (to current version): none
6 ** This file is part of the CVC4 prototype.
7 ** Copyright (c) 2009, 2010 The Analysis of Computer Systems Group (ACSys)
8 ** Courant Institute of Mathematical Sciences
9 ** New York University
10 ** See the file COPYING in the top-level source directory for licensing
11 ** information.
12 **
13 ** A super-class for ANTLR-generated input language parsers
14 **/
15
#include <limits.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <antlr3.h>

#include "antlr_input.h"
#include "bounded_token_buffer.h"
#include "bounded_token_factory.h"
#include "memory_mapped_input_buffer.h"
#include "parser_exception.h"
#include "parser_state.h"

#include "util/output.h"
#include "util/Assert.h"
#include "expr/command.h"
#include "expr/type.h"
31
32 using namespace std;
33 using namespace CVC4;
34 using namespace CVC4::parser;
35 using namespace CVC4::kind;
36
37 namespace CVC4 {
38 namespace parser {
39
40 AntlrInput::AntlrInput(ExprManager* exprManager, const std::string& filename, unsigned int lookahead, bool useMmap) :
41 Input(exprManager, filename),
42 d_lookahead(lookahead),
43 d_lexer(NULL),
44 d_parser(NULL),
45 d_tokenStream(NULL) {
46
47 if( useMmap ) {
48 d_input = MemoryMappedInputBufferNew(filename);
49 } else {
50 d_input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) filename.c_str());
51 }
52 if( d_input == NULL ) {
53 throw ParserException("Couldn't open file: " + filename);
54 }
55 }
56
57 /*
58 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
59 Parser(exprManager,name),
60 d_lookahead(lookahead) {
61
62 }
63 */
64
65 AntlrInput::AntlrInput(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
66 Input(exprManager,name),
67 d_lookahead(lookahead),
68 d_lexer(NULL),
69 d_parser(NULL),
70 d_tokenStream(NULL) {
71 char* inputStr = strdup(input.c_str());
72 char* nameStr = strdup(name.c_str());
73 if( inputStr==NULL || nameStr==NULL ) {
74 throw ParserException("Couldn't initialize string input: '" + input + "'");
75 }
76 d_input = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
77 if( d_input == NULL ) {
78 throw ParserException("Couldn't create input stream for string: '" + input + "'");
79 }
80 }
81
82 AntlrInput::~AntlrInput() {
83 d_tokenStream->free(d_tokenStream);
84 d_input->close(d_input);
85 }
86
87 pANTLR3_INPUT_STREAM AntlrInput::getInputStream() {
88 return d_input;
89 }
90
91 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
92 return d_tokenStream;
93 }
94
95
96 /// Match current input symbol against ttype. Upon error, do one token
97 /// insertion or deletion if possible.
98 /// To turn off single token insertion or deletion error
99 /// recovery, override mismatchRecover() and have it call
100 /// plain mismatch(), which does not recover. Then any error
101 /// in a rule will cause an exception and immediate exit from
102 /// rule. Rule would recover by resynchronizing to the set of
103 /// symbols that can follow rule ref.
104 ///
105 // [chris 4/5/2010] Copy and paste from antlr3baserecognizer.c
106 void *
107 AntlrInput::match(pANTLR3_BASE_RECOGNIZER recognizer, ANTLR3_UINT32 ttype,
108 pANTLR3_BITSET_LIST follow) {
109 pANTLR3_PARSER parser;
110 pANTLR3_TREE_PARSER tparser;
111 pANTLR3_INT_STREAM is;
112 void * matchedSymbol;
113
114 switch(recognizer->type) {
115 case ANTLR3_TYPE_PARSER:
116
117 parser = (pANTLR3_PARSER)(recognizer->super);
118 tparser = NULL;
119 is = parser->tstream->istream;
120
121 break;
122
123 case ANTLR3_TYPE_TREE_PARSER:
124
125 tparser = (pANTLR3_TREE_PARSER)(recognizer->super);
126 parser = NULL;
127 is = tparser->ctnstream->tnstream->istream;
128
129 break;
130
131 default:
132
133 ANTLR3_FPRINTF(
134 stderr,
135 "Base recognizer function 'match' called by unknown parser type - provide override for this function\n");
136 return ANTLR3_FALSE;
137
138 break;
139 }
140
141 // Pick up the current input token/node for assignment to labels
142 //
143 matchedSymbol = recognizer->getCurrentInputSymbol(recognizer, is);
144
145 if(is->_LA(is, 1) == ttype) {
146 // The token was the one we were told to expect
147 //
148 is->consume(is); // Consume that token from the stream
149 recognizer->state->errorRecovery = ANTLR3_FALSE; // Not in error recovery now (if we were)
150 recognizer->state->failed = ANTLR3_FALSE; // The match was a success
151 return matchedSymbol; // We are done
152 }
153
154 // We did not find the expected token type, if we are backtracking then
155 // we just set the failed flag and return.
156 //
157 if(recognizer->state->backtracking > 0) {
158 // Backtracking is going on
159 //
160 recognizer->state->failed = ANTLR3_TRUE;
161 return matchedSymbol;
162 }
163
164 // We did not find the expected token and there is no backtracking
165 // going on, so we mismatch, which creates an exception in the recognizer exception
166 // stack.
167 //
168 matchedSymbol = recognizer->recoverFromMismatchedToken(recognizer, ttype,
169 follow);
170 return matchedSymbol;
171 }
172
173 void AntlrInput::parseError(const std::string& message)
174 throw (ParserException) {
175 Debug("parser") << "Throwing exception: "
176 << getParserState()->getFilename() << ":"
177 << d_lexer->getLine(d_lexer) << "."
178 << d_lexer->getCharPositionInLine(d_lexer) << ": "
179 << message << endl;
180 throw ParserException(message, getParserState()->getFilename(),
181 d_lexer->getLine(d_lexer),
182 d_lexer->getCharPositionInLine(d_lexer));
183 }
184
185 void *
186 AntlrInput::recoverFromMismatchedToken(pANTLR3_BASE_RECOGNIZER recognizer,
187 ANTLR3_UINT32 ttype,
188 pANTLR3_BITSET_LIST follow) {
189
190 pANTLR3_PARSER parser = (pANTLR3_PARSER) (recognizer->super);
191 pANTLR3_INT_STREAM is = parser->tstream->istream;
192 void *matchedSymbol;
193
194
195 // Create an exception if we need one
196 //
197 if(recognizer->state->exception == NULL) {
198 antlr3RecognitionExceptionNew(recognizer);
199 }
200
201 if(recognizer->mismatchIsUnwantedToken(recognizer, is, ttype) == ANTLR3_TRUE) {
202 recognizer->state->exception->type = ANTLR3_UNWANTED_TOKEN_EXCEPTION;
203 recognizer->state->exception->message
204 = (void*)ANTLR3_UNWANTED_TOKEN_EXCEPTION_NAME;
205 }
206
207 if(recognizer->mismatchIsMissingToken(recognizer, is, follow)) {
208 matchedSymbol = recognizer->getMissingSymbol(recognizer, is,
209 recognizer->state->exception,
210 ttype, follow);
211 recognizer->state->exception->type = ANTLR3_MISSING_TOKEN_EXCEPTION;
212 recognizer->state->exception->message = (void*)ANTLR3_MISSING_TOKEN_EXCEPTION_NAME;
213 recognizer->state->exception->token = matchedSymbol;
214 recognizer->state->exception->expecting = ttype;
215 }
216
217 reportError(recognizer);
218 Unreachable("reportError should have thrown exception in AntlrInput::recoverFromMismatchedToken");
219 }
220
221 void AntlrInput::reportError(pANTLR3_BASE_RECOGNIZER recognizer) {
222 pANTLR3_EXCEPTION ex = recognizer->state->exception;
223 pANTLR3_UINT8 * tokenNames = recognizer->state->tokenNames;
224 stringstream ss;
225 // std::string msg;
226
227 // Signal we are in error recovery now
228 recognizer->state->errorRecovery = ANTLR3_TRUE;
229
230 // Indicate this recognizer had an error while processing.
231 recognizer->state->errorCount++;
232
233 // Call the builtin error formatter
234 // recognizer->displayRecognitionError(recognizer, recognizer->state->tokenNames);
235
236 /* This switch statement is adapted from antlr3baserecognizer.c:displayRecognitionError in libantlr3c.
237 * TODO: Make error messages more useful, maybe by including more expected tokens and information
238 * about the current token. */
239 switch(ex->type) {
240 case ANTLR3_UNWANTED_TOKEN_EXCEPTION:
241
242 // Indicates that the recognizer was fed a token which seems to be
243 // spurious input. We can detect this when the token that follows
244 // this unwanted token would normally be part of the syntactically
245 // correct stream. Then we can see that the token we are looking at
246 // is just something that should not be there and throw this exception.
247 //
248 if(tokenNames == NULL) {
249 ss << "Unexpected token." ;
250 } else {
251 if(ex->expecting == ANTLR3_TOKEN_EOF) {
252 ss << "Expected end of file.";
253 } else {
254 ss << "Expected " << tokenNames[ex->expecting]
255 << ", found '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
256 }
257 }
258 break;
259
260 case ANTLR3_MISSING_TOKEN_EXCEPTION:
261
262 // Indicates that the recognizer detected that the token we just
263 // hit would be valid syntactically if preceded by a particular
264 // token. Perhaps a missing ';' at line end or a missing ',' in an
265 // expression list, and such like.
266 //
267 if(tokenNames == NULL) {
268 ss << "Missing token (" << ex->expecting << ").";
269 } else {
270 if(ex->expecting == ANTLR3_TOKEN_EOF) {
271 ss << "Missing end of file marker.";
272 } else {
273 ss << "Missing " << tokenNames[ex->expecting] << ".";
274 }
275 }
276 break;
277
278 case ANTLR3_RECOGNITION_EXCEPTION:
279
280 // Indicates that the recognizer received a token
281 // in the input that was not predicted. This is the basic exception type
282 // from which all others are derived. So we assume it was a syntax error.
283 // You may get this if there are not more tokens and more are needed
284 // to complete a parse for instance.
285 //
286 ss <<"Syntax error.";
287 break;
288
289 case ANTLR3_MISMATCHED_TOKEN_EXCEPTION:
290
291 // We were expecting to see one thing and got another. This is the
292 // most common error if we could not detect a missing or unwanted token.
293 // Here you can spend your efforts to
294 // derive more useful error messages based on the expected
295 // token set and the last token and so on. The error following
296 // bitmaps do a good job of reducing the set that we were looking
297 // for down to something small. Knowing what you are parsing may be
298 // able to allow you to be even more specific about an error.
299 //
300 if(tokenNames == NULL) {
301 ss << "Syntax error.";
302 } else {
303 if(ex->expecting == ANTLR3_TOKEN_EOF) {
304 ss << "Expected end of file.";
305 } else {
306 ss << "Expected " << tokenNames[ex->expecting] << ".";
307 }
308 }
309 break;
310
311 case ANTLR3_NO_VIABLE_ALT_EXCEPTION:
312 // We could not pick any alt decision from the input given
313 // so god knows what happened - however when you examine your grammar,
314 // you should. It means that at the point where the current token occurred
315 // that the DFA indicates nowhere to go from here.
316 //
317 ss << "Unexpected token: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
318 break;
319
320 case ANTLR3_MISMATCHED_SET_EXCEPTION:
321
322 {
323 ANTLR3_UINT32 count;
324 ANTLR3_UINT32 bit;
325 ANTLR3_UINT32 size;
326 ANTLR3_UINT32 numbits;
327 pANTLR3_BITSET errBits;
328
329 // This means we were able to deal with one of a set of
330 // possible tokens at this point, but we did not see any
331 // member of that set.
332 //
333 ss << "Unexpected input: '" << tokenText((pANTLR3_COMMON_TOKEN)ex->token)
334 << "'. Expected one of: ";
335
336 // What tokens could we have accepted at this point in the
337 // parse?
338 //
339 count = 0;
340 errBits = antlr3BitsetLoad(ex->expectingSet);
341 numbits = errBits->numBits(errBits);
342 size = errBits->size(errBits);
343
344 if(size > 0) {
345 // However many tokens we could have dealt with here, it is usually
346 // not useful to print ALL of the set here. I arbitrarily chose 8
347 // here, but you should do whatever makes sense for you of course.
348 // No token number 0, so look for bit 1 and on.
349 //
350 for(bit = 1; bit < numbits && count < 8 && count < size; bit++) {
351 // TODO: This doesn;t look right - should be asking if the bit is set!!
352 //
353 if(tokenNames[bit]) {
354 if( count++ > 0 ) {
355 ss << ", ";
356 }
357 ss << tokenNames[bit];
358 }
359 }
360 } else {
361 Unreachable("Parse error with empty set of expected tokens.");
362 }
363 }
364 break;
365
366 case ANTLR3_EARLY_EXIT_EXCEPTION:
367
368 // We entered a loop requiring a number of token sequences
369 // but found a token that ended that sequence earlier than
370 // we should have done.
371 //
372 ss << "Sequence terminated early by token: '"
373 << tokenText((pANTLR3_COMMON_TOKEN)ex->token) << "'.";
374 break;
375
376 default:
377
378 // We don't handle any other exceptions here, but you can
379 // if you wish. If we get an exception that hits this point
380 // then we are just going to report what we know about the
381 // token.
382 //
383 Unhandled("Unexpected exception in parser.");
384 break;
385 }
386
387 // Now get ready to throw an exception
388 pANTLR3_PARSER parser = (pANTLR3_PARSER)(recognizer->super);
389 AlwaysAssert(parser!=NULL);
390 ParserState *parserState = (ParserState*)(parser->super);
391 AlwaysAssert(parserState!=NULL);
392
393 // Call the error display routine
394 parserState->parseError(ss.str());
395 }
396
397 void AntlrInput::setLexer(pANTLR3_LEXER pLexer) {
398 d_lexer = pLexer;
399
400 pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
401 if( pTokenFactory != NULL ) {
402 pTokenFactory->close(pTokenFactory);
403 }
404
405 /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
406 pTokenFactory = BoundedTokenFactoryNew(d_input, 2*d_lookahead);
407 if( pTokenFactory == NULL ) {
408 throw ParserException("Couldn't create token factory.");
409 }
410 d_lexer->rec->state->tokFactory = pTokenFactory;
411
412 pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
413 if( buffer == NULL ) {
414 throw ParserException("Couldn't create token buffer.");
415 }
416
417 d_tokenStream = buffer->commonTstream;
418 }
419
420 void AntlrInput::setParser(pANTLR3_PARSER pParser) {
421 d_parser = pParser;
422 // ANTLR isn't using super, AFAICT.
423 // We could also use @parser::context to add a field to the generated parser, but then
424 // it would have to be declared separately in every input's grammar and we'd have to
425 // pass it in as an address anyway.
426 d_parser->super = getParserState();
427 d_parser->rec->match = &match;
428 d_parser->rec->reportError = &reportError;
429 d_parser->rec->recoverFromMismatchedToken = &recoverFromMismatchedToken;
430 }
431
432
433 }/* CVC4::parser namespace */
434 }/* CVC4 namespace */