Merge branch '1.4.x'
[cvc5.git] / src / parser / antlr_input.cpp
1 /********************* */
2 /*! \file antlr_input.cpp
3 ** \verbatim
4 ** Original author: Christopher L. Conway
5 ** Major contributors: Morgan Deters, Kshitij Bansal
6 ** Minor contributors (to current version): Francois Bobot
7 ** This file is part of the CVC4 project.
8 ** Copyright (c) 2009-2014 New York University and The University of Iowa
9 ** See the file COPYING in the top-level source directory for licensing
10 ** information.\endverbatim
11 **
12 ** \brief A super-class for ANTLR-generated input language parsers.
13 **
14 ** A super-class for ANTLR-generated input language parsers
15 **/
16
17 #include <limits.h>
18 #include <antlr3.h>
19 #include <stdint.h>
20
21 #include "parser/antlr_input.h"
22 #include "parser/input.h"
23 #include "parser/bounded_token_buffer.h"
24 #include "parser/bounded_token_factory.h"
25 #include "parser/antlr_line_buffered_input.h"
26 #include "parser/memory_mapped_input_buffer.h"
27 #include "parser/parser_exception.h"
28 #include "parser/parser.h"
29
30 #include "expr/command.h"
31 #include "expr/type.h"
32 #include "parser/cvc/cvc_input.h"
33 #include "parser/smt1/smt1_input.h"
34 #include "parser/smt2/smt2_input.h"
35 #include "parser/tptp/tptp_input.h"
36 #include "util/output.h"
37
38 using namespace std;
39 using namespace CVC4;
40 using namespace CVC4::parser;
41 using namespace CVC4::kind;
42
43 namespace CVC4 {
44 namespace parser {
45
46 AntlrInputStream::AntlrInputStream(std::string name,
47 pANTLR3_INPUT_STREAM input,
48 bool fileIsTemporary) :
49 InputStream(name, fileIsTemporary),
50 d_input(input) {
51 assert( input != NULL );
52 input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
53 }
54
55 AntlrInputStream::~AntlrInputStream() {
56 d_input->free(d_input);
57 }
58
59 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
60 return d_input;
61 }
62
63 AntlrInputStream*
64 AntlrInputStream::newFileInputStream(const std::string& name,
65 bool useMmap)
66 throw (InputStreamException) {
67 #ifdef _WIN32
68 if(useMmap) {
69 useMmap = false;
70 }
71 #endif
72 pANTLR3_INPUT_STREAM input = NULL;
73 if(useMmap) {
74 input = MemoryMappedInputBufferNew(name);
75 } else {
76 // libantlr3c v3.2 isn't source-compatible with v3.4
77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
78 input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) name.c_str());
79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
80 input = antlr3FileStreamNew((pANTLR3_UINT8) name.c_str(), ANTLR3_ENC_8BIT);
81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
82 }
83 if(input == NULL) {
84 throw InputStreamException("Couldn't open file: " + name);
85 }
86 return new AntlrInputStream( name, input );
87 }
88
89 AntlrInputStream*
90 AntlrInputStream::newStreamInputStream(std::istream& input,
91 const std::string& name,
92 bool lineBuffered)
93 throw (InputStreamException) {
94
95 pANTLR3_INPUT_STREAM inputStream = NULL;
96
97 if(lineBuffered) {
98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
99 inputStream =
100 antlr3LineBufferedStreamNew(input,
101 0,
102 (pANTLR3_UINT8) strdup(name.c_str()));
103 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
104 inputStream =
105 antlr3LineBufferedStreamNew(input,
106 ANTLR3_ENC_8BIT,
107 (pANTLR3_UINT8) strdup(name.c_str()));
108 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
109 } else {
110
111 // Since these are all NULL on entry, realloc will be called
112 char *basep = NULL, *boundp = NULL, *cp = NULL;
113 /* 64KB seems like a reasonable default size. */
114 size_t bufSize = 0x10000;
115
116 /* Keep going until we can't go no more. */
117 while( !input.eof() && !input.fail() ) {
118
119 if( cp == boundp ) {
120 /* We ran out of room in the buffer. Realloc at double the size. */
121 ptrdiff_t offset = cp - basep;
122 basep = (char *) realloc(basep, bufSize);
123 if( basep == NULL ) {
124 throw InputStreamException("Failed buffering input stream: " + name);
125 }
126 cp = basep + offset;
127 boundp = basep + bufSize;
128 bufSize *= 2;
129 }
130
131 /* Read as much as we have room for. */
132 input.read( cp, boundp - cp );
133 cp += input.gcount();
134 }
135
136 /* Make sure the fail bit didn't get set. */
137 if( !input.eof() ) {
138 throw InputStreamException("Stream input failed: " + name);
139 }
140
141 /* Create an ANTLR input backed by the buffer. */
142 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
143 inputStream =
144 antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) basep,
145 cp - basep,
146 (pANTLR3_UINT8) strdup(name.c_str()));
147 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
148 inputStream =
149 antlr3StringStreamNew((pANTLR3_UINT8) basep,
150 ANTLR3_ENC_8BIT,
151 cp - basep,
152 (pANTLR3_UINT8) strdup(name.c_str()));
153 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
154
155 }
156
157 if( inputStream == NULL ) {
158 throw InputStreamException("Couldn't initialize input: " + name);
159 }
160
161 return new AntlrInputStream( name, inputStream );
162 }
163
164
165 AntlrInputStream*
166 AntlrInputStream::newStringInputStream(const std::string& input,
167 const std::string& name)
168 throw (InputStreamException) {
169 char* inputStr = strdup(input.c_str());
170 char* nameStr = strdup(name.c_str());
171 assert( inputStr!=NULL && nameStr!=NULL );
172 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
173 pANTLR3_INPUT_STREAM inputStream =
174 antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) inputStr,
175 input.size(),
176 (pANTLR3_UINT8) nameStr);
177 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
178 pANTLR3_INPUT_STREAM inputStream =
179 antlr3StringStreamNew((pANTLR3_UINT8) inputStr,
180 ANTLR3_ENC_8BIT,
181 input.size(),
182 (pANTLR3_UINT8) nameStr);
183 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
184 if( inputStream==NULL ) {
185 throw InputStreamException("Couldn't initialize string input: '" + input + "'");
186 }
187 return new AntlrInputStream( name, inputStream );
188 }
189
190 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
191 using namespace language::input;
192
193 AntlrInput* input;
194
195 switch(lang) {
196 case LANG_CVC4: {
197 input = new CvcInput(inputStream);
198 break;
199 }
200 case LANG_SMTLIB_V1:
201 input = new Smt1Input(inputStream);
202 break;
203
204 case LANG_SMTLIB_V2_0:
205 case LANG_SMTLIB_V2_5:
206 input = new Smt2Input(inputStream, lang);
207 break;
208
209 case LANG_TPTP:
210 input = new TptpInput(inputStream);
211 break;
212
213 default:
214 std::stringstream ss;
215 ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
216 throw InputStreamException(ss.str());
217 }
218
219 return input;
220 }
221
222 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
223 Input(inputStream),
224 d_lookahead(lookahead),
225 d_lexer(NULL),
226 d_parser(NULL),
227 d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
228 d_tokenBuffer(NULL) {
229 }
230
231 /*
232 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
233 Parser(exprManager,name),
234 d_lookahead(lookahead) {
235
236 }
237 */
238
239 /*
240 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
241 Input(exprManager,name),
242 d_lookahead(lookahead),
243 d_lexer(NULL),
244 d_parser(NULL),
245 d_tokenStream(NULL) {
246
247 char* inputStr = strdup(input.c_str());
248 char* nameStr = strdup(name.c_str());
249 if( inputStr==NULL || nameStr==NULL ) {
250 throw ParserException("Couldn't initialize string input: '" + input + "'");
251 }
252 d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
253 if( d_inputStream == NULL ) {
254 throw ParserException("Couldn't create input stream for string: '" + input + "'");
255 }
256
257 }
258 */
259
260 AntlrInput::~AntlrInput() {
261 BoundedTokenBufferFree(d_tokenBuffer);
262 }
263
264 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
265 return d_tokenBuffer->commonTstream;
266 }
267
268 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
269 pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
270 assert(lexer!=NULL);
271 Parser *parser = (Parser*)(lexer->super);
272 assert(parser!=NULL);
273 AntlrInput *input = (AntlrInput*) parser->getInput();
274 assert(input!=NULL);
275
276 /* Call the error display routine *if* there's not already a
277 * parse error pending. If a parser error is pending, this
278 * error is probably less important, so we just drop it. */
279 if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
280 input->parseError("Error finding next token.");
281 }
282 }
283
284 void AntlrInput::warning(const std::string& message) {
285 Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
286 }
287
288
289 /**
290 * characters considered part of a simple symbol in SMTLIB.
291 *
292 * TODO: Ideally this code shouldn't be smtlib specific (should work
293 * with CVC language too), but this per-language specialization has
294 * been left for a later point.
295 */
296 inline bool isSimpleChar(char ch) {
297 return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
298 }
299
300 size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
301 size_t st = 0;
302 size_t N = input.size();
303 while(st < N) {
304 while( st < N && (*isWordChar)(input[st]) == false ) st++;
305 size_t en = st;
306 while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
307 if(en - st + 1 == pattern.size()) {
308 bool match = true;
309 for(size_t i = 0; match && i < pattern.size(); ++i) {
310 match &= (pattern[i] == input[st+i]);
311 }
312 if(match == true) {
313 return st;
314 }
315 }
316 st = en + 1;
317 }
318 return string::npos;
319 }
320
321 /**
322 * Gets part of original input and tries to visually hint where the
323 * error might be.
324 *
325 * Something like:
326 *
327 * ...nd (= alpha beta) (= beta delta))
328 * ^
329 *
330 * Implementation (as on 2014/04/24):
331 *
332 * > if suggested pointer by lexer is under a "simple char", move to
333 * start of the word and print pointer there.
334 *
335 * > in the other case, it tries to find the nearest word in the error
336 * message passed along. if it can't find it, we don't add this
337 * visual hint, as experimentally position suggested by lexer was
338 * found to be totally unhelpful. (TODO: fix this upstream to
339 * improve)
340 */
341 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
342 {
343 // Is it a multi-line message
344 bool multilineMessage = (message.find('\n') != string::npos);
345 // Useful only if it is a multi-line message
346 int firstLineEnd = message.find('\n');
347
348 std::ostringstream ss, slicess;
349
350 // Keep first line of message
351 if(multilineMessage) {
352 ss << message.substr(0, firstLineEnd) << endl << endl;
353 } else {
354 ss << message << endl << endl;
355 }
356
357 int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
358 int posSliceEnd = posSliceStart + 70;
359 int caretPos = 0;
360 int caretPosExtra = 0; // for inital intendation, epilipses etc.
361
362 ss << " "; caretPosExtra += 2;
363 if(posSliceStart > 0) {
364 ss << "..."; caretPosExtra += 3;
365 }
366
367 for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
368 if(i == posSliceEnd) {
369 ss << "...";
370 break;
371 }
372 if(i < charPositionInLine) { caretPos++; }
373
374 if(!isprint(lineStart[i])) {
375 // non-printable character, something wrong, bail out
376 return message;
377 }
378
379 ss << (lineStart[i]);
380 slicess << (lineStart[i]);
381 }
382
383 // adjust position of caret, based on slice and message
384 {
385 int caretPosOrig = caretPos;
386 string slice = slicess.str();
387 if(isSimpleChar(slice[caretPos])) {
388 // if alphanumeric, try to go to beginning of word/number
389 while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
390 if(caretPos == 0 && posSliceStart > 0) {
391 // reached start and this is not really the start? bail out
392 return message;
393 } else {
394 // likely it is also in original message? if so, very likely
395 // we found the right place
396 string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
397 size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
398 Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
399 if( matchLoc != string::npos ) {
400 Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
401 }
402 }
403 } else {
404 bool foundCaretPos = false;
405
406 for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
407 // go to nearest alphanumeric string (before current position),
408 // see if that word can be found in original message. If so,
409 // point to that, else keep pointer where it was.
410 int nearestWordEn = caretPos - 1;
411 while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
412 --nearestWordEn;
413 }
414 if(isSimpleChar(slice[nearestWordEn])) {
415 int nearestWordSt = nearestWordEn;
416 while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
417 --nearestWordSt;
418 }
419 string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
420 size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
421 Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
422 Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
423 if( matchLoc != string::npos ) {
424 Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
425 << nearestWordSt << std::endl;
426 foundCaretPos = true;
427 }
428 caretPos = nearestWordSt;
429 }
430 }
431 if( !foundCaretPos) {
432 // this doesn't look good. caret generally getting printed
433 // at unhelpful positions. improve upstream?
434 return message;
435 }
436 }
437 caretPos += caretPosExtra;
438 }// end of caret position computation/heuristics
439
440 ss << std::endl;
441 while( caretPos-- > 0 ) {
442 ss << ' ';
443 }
444 ss << '^' << endl;
445 if(multilineMessage) {
446 ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
447 }
448 return ss.str();
449 }
450
451 void AntlrInput::parseError(const std::string& message, bool eofException)
452 throw (ParserException) {
453
454 string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
455 d_lexer->getCharPositionInLine(d_lexer),
456 message);
457
458 Debug("parser") << "Throwing exception: "
459 << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
460 << d_lexer->getLine(d_lexer) << "."
461 << d_lexer->getCharPositionInLine(d_lexer) << ": "
462 << updatedMessage << endl;
463 if(eofException) {
464 throw ParserEndOfFileException(message,
465 (const char*)d_lexer->rec->state->tokSource->fileName->chars,
466 d_lexer->getLine(d_lexer),
467 d_lexer->getCharPositionInLine(d_lexer));
468 } else {
469 throw ParserException(updatedMessage,
470 (const char*)d_lexer->rec->state->tokSource->fileName->chars,
471 d_lexer->getLine(d_lexer),
472 d_lexer->getCharPositionInLine(d_lexer));
473 }
474 }
475
476
477 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
478 d_lexer = pLexer;
479
480 pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
481 if( pTokenFactory != NULL ) {
482 pTokenFactory->close(pTokenFactory);
483 }
484
485 /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
486 pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
487 if( pTokenFactory == NULL ) {
488 throw InputStreamException("Couldn't create token factory.");
489 }
490 d_lexer->rec->state->tokFactory = pTokenFactory;
491
492 pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
493 if( buffer == NULL ) {
494 throw InputStreamException("Couldn't create token buffer.");
495 }
496
497 d_tokenBuffer = buffer;
498
499 // Override default lexer error reporting
500 d_lexer->rec->reportError = &lexerError;
501 // Override default nextToken function, just to prevent exceptions escaping.
502 d_lexer->rec->state->tokSource->nextToken = &nextToken;
503 }
504
505 void AntlrInput::setParser(Parser& parser) {
506 // ANTLR isn't using super in the lexer or the parser, AFAICT.
507 // We could also use @lexer/parser::context to add a field to the generated
508 // objects, but then it would have to be declared separately in every
509 // language's grammar and we'd have to in the address of the field anyway.
510 d_lexer->super = &parser;
511 d_parser->super = &parser;
512 }
513
514 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
515 d_parser = pParser;
516 // d_parser->rec->match = &match;
517 d_parser->rec->reportError = &reportError;
518 /* Don't try to recover from a parse error. */
519 // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
520 d_parser->rec->recoverFromMismatchedToken =
521 (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
522 d_parser->rec->mismatch;
523 }
524
525 }/* CVC4::parser namespace */
526 }/* CVC4 namespace */