6428017f5c4f3078a8ebc660ce203b8fdd423bce
[cvc5.git] / src / parser / antlr_input.cpp
1 /********************* */
2 /*! \file antlr_input.cpp
3 ** \verbatim
4 ** Original author: Christopher L. Conway
5 ** Major contributors: Morgan Deters, Kshitij Bansal
6 ** Minor contributors (to current version): Francois Bobot
7 ** This file is part of the CVC4 project.
8 ** Copyright (c) 2009-2014 New York University and The University of Iowa
9 ** See the file COPYING in the top-level source directory for licensing
10 ** information.\endverbatim
11 **
12 ** \brief A super-class for ANTLR-generated input language parsers.
13 **
14 ** A super-class for ANTLR-generated input language parsers
15 **/
16
17 #include "parser/antlr_input.h"
18 // We rely on the inclusion of #include <antlr3.h> in "parser/antlr_input.h".
19 // This is avoid having to undefine the symbols in <antlr3.h>.
20 // See the documentation in "parser/antlr_undefines.h" for more
21 // details.
22
23 #include <limits.h>
24 #include <stdint.h>
25
26 #include "base/output.h"
27 #include "expr/type.h"
28 #include "parser/antlr_line_buffered_input.h"
29 #include "parser/bounded_token_buffer.h"
30 #include "parser/bounded_token_factory.h"
31 #include "parser/cvc/cvc_input.h"
32 #include "parser/input.h"
33 #include "parser/memory_mapped_input_buffer.h"
34 #include "parser/parser.h"
35 #include "parser/parser_exception.h"
36 #include "parser/smt1/smt1_input.h"
37 #include "parser/smt2/smt2_input.h"
38 #include "parser/smt2/sygus_input.h"
39 #include "parser/tptp/tptp_input.h"
40 #include "smt_util/command.h"
41
42 using namespace std;
43 using namespace CVC4;
44 using namespace CVC4::parser;
45 using namespace CVC4::kind;
46
47 namespace CVC4 {
48 namespace parser {
49
50 // These functions exactly wrap the antlr3 source inconsistencies.
51 // These are the only location CVC4_ANTLR3_OLD_INPUT_STREAM ifdefs appear.
52 // No other sanity checking happens;
53 pANTLR3_INPUT_STREAM newAntlr3BufferedStream(std::istream& input, const std::string& name);
54 pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name);
55 pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
56 uint32_t size,
57 const std::string& name);
58
59 pANTLR3_INPUT_STREAM
60 newAntlr3BufferedStream(std::istream& input, const std::string& name){
61 pANTLR3_INPUT_STREAM inputStream = NULL;
62 pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
63
64 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
65 inputStream =
66 antlr3LineBufferedStreamNew(input, 0, name_duplicate);
67 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
68 inputStream =
69 antlr3LineBufferedStreamNew(input, ANTLR3_ENC_8BIT, name_duplicate);
70 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
71
72 free(name_duplicate);
73 return inputStream;
74 }
75
76 pANTLR3_INPUT_STREAM newAntlr3FileStream(const std::string& name){
77 pANTLR3_INPUT_STREAM input = NULL;
78 pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
79
80 // libantlr3c v3.2 isn't source-compatible with v3.4
81 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
82 input = antlr3AsciiFileStreamNew(name_duplicate);
83 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
84 input = antlr3FileStreamNew(name_duplicate, ANTLR3_ENC_8BIT);
85 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
86
87 free(name_duplicate);
88 return input;
89 }
90
91
92 pANTLR3_INPUT_STREAM newAntrl3InPlaceStream(pANTLR3_UINT8 basep,
93 uint32_t size,
94 const std::string& name){
95 pANTLR3_UINT8 name_duplicate = (pANTLR3_UINT8) strdup(name.c_str());
96 pANTLR3_INPUT_STREAM inputStream = NULL;
97 /* Create an ANTLR input backed by the buffer. */
98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
99 inputStream =
100 antlr3NewAsciiStringInPlaceStream(basep, size, name_duplicate);
101 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
102 inputStream =
103 antlr3StringStreamNew(basep, ANTLR3_ENC_8BIT, size,
104 name_duplicate);
105 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
106 free(name_duplicate);
107 return inputStream;
108 }
109
110 AntlrInputStream::AntlrInputStream(std::string name,
111 pANTLR3_INPUT_STREAM input,
112 bool fileIsTemporary,
113 pANTLR3_UINT8 inputString) :
114 InputStream(name, fileIsTemporary),
115 d_input(input),
116 d_inputString(inputString)
117 {
118 assert( input != NULL );
119 input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
120 }
121
122 AntlrInputStream::~AntlrInputStream() {
123 d_input->free(d_input);
124 if(d_inputString != NULL){
125 free(d_inputString);
126 }
127 }
128
129 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
130 return d_input;
131 }
132
133
134
135 AntlrInputStream*
136 AntlrInputStream::newFileInputStream(const std::string& name,
137 bool useMmap)
138 throw (InputStreamException) {
139 #ifdef _WIN32
140 if(useMmap) {
141 useMmap = false;
142 }
143 #endif
144 pANTLR3_INPUT_STREAM input = NULL;
145 if(useMmap) {
146 input = MemoryMappedInputBufferNew(name);
147 } else {
148 input = newAntlr3FileStream(name);
149 }
150 if(input == NULL) {
151 throw InputStreamException("Couldn't open file: " + name);
152 }
153 return new AntlrInputStream( name, input, false, NULL );
154 }
155
156
157 AntlrInputStream*
158 AntlrInputStream::newStreamInputStream(std::istream& input,
159 const std::string& name,
160 bool lineBuffered)
161 throw (InputStreamException) {
162
163 pANTLR3_INPUT_STREAM inputStream = NULL;
164 pANTLR3_UINT8 inputStringCopy = NULL;
165
166 if(lineBuffered) {
167 inputStream = newAntlr3BufferedStream(input, name);
168 } else {
169
170 // Since these are all NULL on entry, realloc will be called
171 char *basep = NULL, *boundp = NULL, *cp = NULL;
172 /* 64KB seems like a reasonable default size. */
173 size_t bufSize = 0x10000;
174
175 /* Keep going until we can't go no more. */
176 while( !input.eof() && !input.fail() ) {
177
178 if( cp == boundp ) {
179 /* We ran out of room in the buffer. Realloc at double the size. */
180 ptrdiff_t offset = cp - basep;
181 basep = (char *) realloc(basep, bufSize);
182 if( basep == NULL ) {
183 throw InputStreamException("Failed buffering input stream: " + name);
184 }
185 cp = basep + offset;
186 boundp = basep + bufSize;
187 bufSize *= 2;
188 }
189
190 /* Read as much as we have room for. */
191 input.read( cp, boundp - cp );
192 cp += input.gcount();
193 }
194
195 /* Make sure the fail bit didn't get set. */
196 if( !input.eof() ) {
197 throw InputStreamException("Stream input failed: " + name);
198 }
199 ptrdiff_t offset = cp - basep;
200 assert(offset >= 0);
201 assert(offset <= std::numeric_limits<uint32_t>::max());
202 inputStringCopy = (pANTLR3_UINT8)basep;
203 inputStream = newAntrl3InPlaceStream(inputStringCopy, (uint32_t) offset, name);
204 }
205
206 if( inputStream == NULL ) {
207 throw InputStreamException("Couldn't initialize input: " + name);
208 }
209
210 return new AntlrInputStream( name, inputStream, false, inputStringCopy );
211 }
212
213
214 AntlrInputStream*
215 AntlrInputStream::newStringInputStream(const std::string& input,
216 const std::string& name)
217 throw (InputStreamException) {
218
219 size_t input_size = input.size();
220 assert(input_size <= std::numeric_limits<uint32_t>::max());
221
222 // Ownership of input_duplicate is transferred to the AntlrInputStream.
223 pANTLR3_UINT8 input_duplicate = (pANTLR3_UINT8) strdup(input.c_str());
224
225 if( input_duplicate == NULL ) {
226 throw InputStreamException("Couldn't initialize string input: '" + input + "'");
227 }
228
229 pANTLR3_INPUT_STREAM inputStream = newAntrl3InPlaceStream(input_duplicate, (uint32_t)input_size, name);
230 if( inputStream==NULL ) {
231 throw InputStreamException("Couldn't initialize string input: '" + input + "'");
232 }
233 return new AntlrInputStream( name, inputStream, false, input_duplicate );
234 }
235
236 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
237 using namespace language::input;
238
239 AntlrInput* input;
240
241 switch(lang) {
242 case LANG_CVC4: {
243 input = new CvcInput(inputStream);
244 break;
245 }
246 case LANG_SMTLIB_V1:
247 input = new Smt1Input(inputStream);
248 break;
249
250 case LANG_SMTLIB_V2_0:
251 case LANG_SMTLIB_V2_5:
252 input = new Smt2Input(inputStream, lang);
253 break;
254
255 case LANG_SYGUS:
256 input = new SygusInput(inputStream);
257 break;
258
259 case LANG_TPTP:
260 input = new TptpInput(inputStream);
261 break;
262
263 default:
264 std::stringstream ss;
265 ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
266 throw InputStreamException(ss.str());
267 }
268
269 return input;
270 }
271
272 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
273 Input(inputStream),
274 d_lookahead(lookahead),
275 d_lexer(NULL),
276 d_parser(NULL),
277 d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
278 d_tokenBuffer(NULL) {
279 }
280
281 /*
282 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
283 Parser(exprManager,name),
284 d_lookahead(lookahead) {
285
286 }
287 */
288
289 /*
290 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
291 Input(exprManager,name),
292 d_lookahead(lookahead),
293 d_lexer(NULL),
294 d_parser(NULL),
295 d_tokenStream(NULL) {
296
297 char* inputStr = strdup(input.c_str());
298 char* nameStr = strdup(name.c_str());
299 if( inputStr==NULL || nameStr==NULL ) {
300 throw ParserException("Couldn't initialize string input: '" + input + "'");
301 }
302 d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
303 if( d_inputStream == NULL ) {
304 throw ParserException("Couldn't create input stream for string: '" + input + "'");
305 }
306
307 }
308 */
309
310 AntlrInput::~AntlrInput() {
311 BoundedTokenBufferFree(d_tokenBuffer);
312 }
313
314 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
315 return d_tokenBuffer->commonTstream;
316 }
317
318 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
319 pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
320 assert(lexer!=NULL);
321 Parser *parser = (Parser*)(lexer->super);
322 assert(parser!=NULL);
323 AntlrInput *input = (AntlrInput*) parser->getInput();
324 assert(input!=NULL);
325
326 /* Call the error display routine *if* there's not already a
327 * parse error pending. If a parser error is pending, this
328 * error is probably less important, so we just drop it. */
329 if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
330 input->parseError("Error finding next token.");
331 }
332 }
333
334 void AntlrInput::warning(const std::string& message) {
335 Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
336 }
337
338
339
340 /**
341 * characters considered part of a simple symbol in SMTLIB.
342 *
343 * TODO: Ideally this code shouldn't be smtlib specific (should work
344 * with CVC language too), but this per-language specialization has
345 * been left for a later point.
346 */
347 inline bool isSimpleChar(char ch) {
348 return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
349 }
350
351 size_t wholeWordMatch(string input, string pattern, bool (*isWordChar)(char)) {
352 size_t st = 0;
353 size_t N = input.size();
354 while(st < N) {
355 while( st < N && (*isWordChar)(input[st]) == false ) st++;
356 size_t en = st;
357 while(en + 1 < N && (*isWordChar)(input[en + 1]) == true) en++;
358 if(en - st + 1 == pattern.size()) {
359 bool match = true;
360 for(size_t i = 0; match && i < pattern.size(); ++i) {
361 match &= (pattern[i] == input[st+i]);
362 }
363 if(match == true) {
364 return st;
365 }
366 }
367 st = en + 1;
368 }
369 return string::npos;
370 }
371
372 /**
373 * Gets part of original input and tries to visually hint where the
374 * error might be.
375 *
376 * Something like:
377 *
378 * ...nd (= alpha beta) (= beta delta))
379 * ^
380 *
381 * Implementation (as on 2014/04/24):
382 *
383 * > if suggested pointer by lexer is under a "simple char", move to
384 * start of the word and print pointer there.
385 *
386 * > in the other case, it tries to find the nearest word in the error
387 * message passed along. if it can't find it, we don't add this
388 * visual hint, as experimentally position suggested by lexer was
389 * found to be totally unhelpful. (TODO: fix this upstream to
390 * improve)
391 */
392 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
393 {
394 // Is it a multi-line message
395 bool multilineMessage = (message.find('\n') != string::npos);
396 // Useful only if it is a multi-line message
397 int firstLineEnd = message.find('\n');
398
399 std::ostringstream ss, slicess;
400
401 // Keep first line of message
402 if(multilineMessage) {
403 ss << message.substr(0, firstLineEnd) << endl << endl;
404 } else {
405 ss << message << endl << endl;
406 }
407
408 int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
409 int posSliceEnd = posSliceStart + 70;
410 int caretPos = 0;
411 int caretPosExtra = 0; // for inital intendation, epilipses etc.
412
413 ss << " "; caretPosExtra += 2;
414 if(posSliceStart > 0) {
415 ss << "..."; caretPosExtra += 3;
416 }
417
418 for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
419 if(i == posSliceEnd) {
420 ss << "...";
421 break;
422 }
423 if(i < charPositionInLine) { caretPos++; }
424
425 if(!isprint(lineStart[i])) {
426 // non-printable character, something wrong, bail out
427 return message;
428 }
429
430 ss << (lineStart[i]);
431 slicess << (lineStart[i]);
432 }
433
434 // adjust position of caret, based on slice and message
435 {
436 int caretPosOrig = caretPos;
437 string slice = slicess.str();
438 if(isSimpleChar(slice[caretPos])) {
439 // if alphanumeric, try to go to beginning of word/number
440 while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
441 if(caretPos == 0 && posSliceStart > 0) {
442 // reached start and this is not really the start? bail out
443 return message;
444 } else {
445 // likely it is also in original message? if so, very likely
446 // we found the right place
447 string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
448 size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
449 Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
450 if( matchLoc != string::npos ) {
451 Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
452 }
453 }
454 } else {
455 bool foundCaretPos = false;
456
457 for(int tries = 0; tries < 2 && caretPos > 0 && !foundCaretPos; ++tries) {
458 // go to nearest alphanumeric string (before current position),
459 // see if that word can be found in original message. If so,
460 // point to that, else keep pointer where it was.
461 int nearestWordEn = caretPos - 1;
462 while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
463 --nearestWordEn;
464 }
465 if(isSimpleChar(slice[nearestWordEn])) {
466 int nearestWordSt = nearestWordEn;
467 while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
468 --nearestWordSt;
469 }
470 string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
471 size_t matchLoc = wholeWordMatch(message, word, isSimpleChar);
472 Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
473 Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc << endl;
474 if( matchLoc != string::npos ) {
475 Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
476 << nearestWordSt << std::endl;
477 foundCaretPos = true;
478 }
479 caretPos = nearestWordSt;
480 }
481 }
482 if( !foundCaretPos) {
483 // this doesn't look good. caret generally getting printed
484 // at unhelpful positions. improve upstream?
485 return message;
486 }
487 }
488 caretPos += caretPosExtra;
489 }// end of caret position computation/heuristics
490
491 ss << std::endl;
492 while( caretPos-- > 0 ) {
493 ss << ' ';
494 }
495 ss << '^' << endl;
496 if(multilineMessage) {
497 ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
498 }
499 return ss.str();
500 }
501
502 void AntlrInput::parseError(const std::string& message, bool eofException)
503 throw (ParserException) {
504
505 string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
506 d_lexer->getCharPositionInLine(d_lexer),
507 message);
508
509 Debug("parser") << "Throwing exception: "
510 << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
511 << d_lexer->getLine(d_lexer) << "."
512 << d_lexer->getCharPositionInLine(d_lexer) << ": "
513 << updatedMessage << endl;
514 if(eofException) {
515 throw ParserEndOfFileException(message,
516 (const char*)d_lexer->rec->state->tokSource->fileName->chars,
517 d_lexer->getLine(d_lexer),
518 d_lexer->getCharPositionInLine(d_lexer));
519 } else {
520 throw ParserException(updatedMessage,
521 (const char*)d_lexer->rec->state->tokSource->fileName->chars,
522 d_lexer->getLine(d_lexer),
523 d_lexer->getCharPositionInLine(d_lexer));
524 }
525 }
526
527
528 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
529 d_lexer = pLexer;
530
531 pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
532 if( pTokenFactory != NULL ) {
533 pTokenFactory->close(pTokenFactory);
534 }
535
536 /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
537 pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
538 if( pTokenFactory == NULL ) {
539 throw InputStreamException("Couldn't create token factory.");
540 }
541 d_lexer->rec->state->tokFactory = pTokenFactory;
542
543 pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
544 if( buffer == NULL ) {
545 throw InputStreamException("Couldn't create token buffer.");
546 }
547
548 d_tokenBuffer = buffer;
549
550 // Override default lexer error reporting
551 d_lexer->rec->reportError = &lexerError;
552 // Override default nextToken function, just to prevent exceptions escaping.
553 d_lexer->rec->state->tokSource->nextToken = &nextToken;
554 }
555
556 void AntlrInput::setParser(Parser& parser) {
557 // ANTLR isn't using super in the lexer or the parser, AFAICT.
558 // We could also use @lexer/parser::context to add a field to the generated
559 // objects, but then it would have to be declared separately in every
560 // language's grammar and we'd have to in the address of the field anyway.
561 d_lexer->super = &parser;
562 d_parser->super = &parser;
563 }
564
565 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
566 d_parser = pParser;
567 // d_parser->rec->match = &match;
568 d_parser->rec->reportError = &reportError;
569 /* Don't try to recover from a parse error. */
570 // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
571 d_parser->rec->recoverFromMismatchedToken =
572 (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
573 d_parser->rec->mismatch;
574 }
575
576 }/* CVC4::parser namespace */
577 }/* CVC4 namespace */