fix a typo: --string-exp => --strings-exp; fix a signed int warning in antlr
[cvc5.git] / src / parser / antlr_input.cpp
1 /********************* */
2 /*! \file antlr_input.cpp
3 ** \verbatim
4 ** Original author: Christopher L. Conway
5 ** Major contributors: Morgan Deters
6 ** Minor contributors (to current version): Francois Bobot, Kshitij Bansal
7 ** This file is part of the CVC4 project.
8 ** Copyright (c) 2009-2014 New York University and The University of Iowa
9 ** See the file COPYING in the top-level source directory for licensing
10 ** information.\endverbatim
11 **
12 ** \brief A super-class for ANTLR-generated input language parsers.
13 **
14 ** A super-class for ANTLR-generated input language parsers
15 **/
16
17 #include <limits.h>
18 #include <antlr3.h>
19 #include <stdint.h>
20
21 #include "parser/antlr_input.h"
22 #include "parser/input.h"
23 #include "parser/bounded_token_buffer.h"
24 #include "parser/bounded_token_factory.h"
25 #include "parser/antlr_line_buffered_input.h"
26 #include "parser/memory_mapped_input_buffer.h"
27 #include "parser/parser_exception.h"
28 #include "parser/parser.h"
29
30 #include "expr/command.h"
31 #include "expr/type.h"
32 #include "parser/cvc/cvc_input.h"
33 #include "parser/smt1/smt1_input.h"
34 #include "parser/smt2/smt2_input.h"
35 #include "parser/tptp/tptp_input.h"
36 #include "util/output.h"
37
38 using namespace std;
39 using namespace CVC4;
40 using namespace CVC4::parser;
41 using namespace CVC4::kind;
42
43 namespace CVC4 {
44 namespace parser {
45
46 AntlrInputStream::AntlrInputStream(std::string name,
47 pANTLR3_INPUT_STREAM input,
48 bool fileIsTemporary) :
49 InputStream(name, fileIsTemporary),
50 d_input(input) {
51 assert( input != NULL );
52 input->fileName = input->strFactory->newStr8(input->strFactory, (pANTLR3_UINT8)name.c_str());
53 }
54
55 AntlrInputStream::~AntlrInputStream() {
56 d_input->free(d_input);
57 }
58
59 pANTLR3_INPUT_STREAM AntlrInputStream::getAntlr3InputStream() const {
60 return d_input;
61 }
62
63 AntlrInputStream*
64 AntlrInputStream::newFileInputStream(const std::string& name,
65 bool useMmap)
66 throw (InputStreamException) {
67 #ifdef _WIN32
68 if(useMmap) {
69 useMmap = false;
70 }
71 #endif
72 pANTLR3_INPUT_STREAM input = NULL;
73 if(useMmap) {
74 input = MemoryMappedInputBufferNew(name);
75 } else {
76 // libantlr3c v3.2 isn't source-compatible with v3.4
77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
78 input = antlr3AsciiFileStreamNew((pANTLR3_UINT8) name.c_str());
79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
80 input = antlr3FileStreamNew((pANTLR3_UINT8) name.c_str(), ANTLR3_ENC_8BIT);
81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
82 }
83 if(input == NULL) {
84 throw InputStreamException("Couldn't open file: " + name);
85 }
86 return new AntlrInputStream( name, input );
87 }
88
89 AntlrInputStream*
90 AntlrInputStream::newStreamInputStream(std::istream& input,
91 const std::string& name,
92 bool lineBuffered)
93 throw (InputStreamException) {
94
95 pANTLR3_INPUT_STREAM inputStream = NULL;
96
97 if(lineBuffered) {
98 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
99 inputStream =
100 antlr3LineBufferedStreamNew(input,
101 0,
102 (pANTLR3_UINT8) strdup(name.c_str()));
103 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
104 inputStream =
105 antlr3LineBufferedStreamNew(input,
106 ANTLR3_ENC_8BIT,
107 (pANTLR3_UINT8) strdup(name.c_str()));
108 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
109 } else {
110
111 // Since these are all NULL on entry, realloc will be called
112 char *basep = NULL, *boundp = NULL, *cp = NULL;
113 /* 64KB seems like a reasonable default size. */
114 size_t bufSize = 0x10000;
115
116 /* Keep going until we can't go no more. */
117 while( !input.eof() && !input.fail() ) {
118
119 if( cp == boundp ) {
120 /* We ran out of room in the buffer. Realloc at double the size. */
121 ptrdiff_t offset = cp - basep;
122 basep = (char *) realloc(basep, bufSize);
123 if( basep == NULL ) {
124 throw InputStreamException("Failed buffering input stream: " + name);
125 }
126 cp = basep + offset;
127 boundp = basep + bufSize;
128 bufSize *= 2;
129 }
130
131 /* Read as much as we have room for. */
132 input.read( cp, boundp - cp );
133 cp += input.gcount();
134 }
135
136 /* Make sure the fail bit didn't get set. */
137 if( !input.eof() ) {
138 throw InputStreamException("Stream input failed: " + name);
139 }
140
141 /* Create an ANTLR input backed by the buffer. */
142 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
143 inputStream =
144 antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) basep,
145 cp - basep,
146 (pANTLR3_UINT8) strdup(name.c_str()));
147 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
148 inputStream =
149 antlr3StringStreamNew((pANTLR3_UINT8) basep,
150 ANTLR3_ENC_8BIT,
151 cp - basep,
152 (pANTLR3_UINT8) strdup(name.c_str()));
153 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
154
155 }
156
157 if( inputStream == NULL ) {
158 throw InputStreamException("Couldn't initialize input: " + name);
159 }
160
161 return new AntlrInputStream( name, inputStream );
162 }
163
164
165 AntlrInputStream*
166 AntlrInputStream::newStringInputStream(const std::string& input,
167 const std::string& name)
168 throw (InputStreamException) {
169 char* inputStr = strdup(input.c_str());
170 char* nameStr = strdup(name.c_str());
171 assert( inputStr!=NULL && nameStr!=NULL );
172 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
173 pANTLR3_INPUT_STREAM inputStream =
174 antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8) inputStr,
175 input.size(),
176 (pANTLR3_UINT8) nameStr);
177 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
178 pANTLR3_INPUT_STREAM inputStream =
179 antlr3StringStreamNew((pANTLR3_UINT8) inputStr,
180 ANTLR3_ENC_8BIT,
181 input.size(),
182 (pANTLR3_UINT8) nameStr);
183 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
184 if( inputStream==NULL ) {
185 throw InputStreamException("Couldn't initialize string input: '" + input + "'");
186 }
187 return new AntlrInputStream( name, inputStream );
188 }
189
190 AntlrInput* AntlrInput::newInput(InputLanguage lang, AntlrInputStream& inputStream) {
191 using namespace language::input;
192
193 AntlrInput* input;
194
195 switch(lang) {
196 case LANG_CVC4: {
197 input = new CvcInput(inputStream);
198 break;
199 }
200 case LANG_SMTLIB_V1:
201 input = new Smt1Input(inputStream);
202 break;
203
204 case LANG_SMTLIB_V2:
205 input = new Smt2Input(inputStream);
206 break;
207
208 case LANG_TPTP:
209 input = new TptpInput(inputStream);
210 break;
211
212 default:
213 std::stringstream ss;
214 ss << "internal error: unhandled language " << lang << " in AntlrInput::newInput";
215 throw InputStreamException(ss.str());
216 }
217
218 return input;
219 }
220
221 AntlrInput::AntlrInput(AntlrInputStream& inputStream, unsigned int lookahead) :
222 Input(inputStream),
223 d_lookahead(lookahead),
224 d_lexer(NULL),
225 d_parser(NULL),
226 d_antlr3InputStream( inputStream.getAntlr3InputStream() ),
227 d_tokenBuffer(NULL) {
228 }
229
230 /*
231 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
232 Parser(exprManager,name),
233 d_lookahead(lookahead) {
234
235 }
236 */
237
238 /*
239 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
240 Input(exprManager,name),
241 d_lookahead(lookahead),
242 d_lexer(NULL),
243 d_parser(NULL),
244 d_tokenStream(NULL) {
245
246 char* inputStr = strdup(input.c_str());
247 char* nameStr = strdup(name.c_str());
248 if( inputStr==NULL || nameStr==NULL ) {
249 throw ParserException("Couldn't initialize string input: '" + input + "'");
250 }
251 d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
252 if( d_inputStream == NULL ) {
253 throw ParserException("Couldn't create input stream for string: '" + input + "'");
254 }
255
256 }
257 */
258
259 AntlrInput::~AntlrInput() {
260 BoundedTokenBufferFree(d_tokenBuffer);
261 }
262
263 pANTLR3_COMMON_TOKEN_STREAM AntlrInput::getTokenStream() {
264 return d_tokenBuffer->commonTstream;
265 }
266
267 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer) {
268 pANTLR3_LEXER lexer = (pANTLR3_LEXER)(recognizer->super);
269 assert(lexer!=NULL);
270 Parser *parser = (Parser*)(lexer->super);
271 assert(parser!=NULL);
272 AntlrInput *input = (AntlrInput*) parser->getInput();
273 assert(input!=NULL);
274
275 /* Call the error display routine *if* there's not already a
276 * parse error pending. If a parser error is pending, this
277 * error is probably less important, so we just drop it. */
278 if( input->d_parser->rec->state->error == ANTLR3_FALSE ) {
279 input->parseError("Error finding next token.");
280 }
281 }
282
283 void AntlrInput::warning(const std::string& message) {
284 Warning() << getInputStream()->getName() << ':' << d_lexer->getLine(d_lexer) << '.' << d_lexer->getCharPositionInLine(d_lexer) << ": " << message << endl;
285 }
286
287
288 /**
289 * characters considered part of a simple symbol in SMTLIB.
290 *
291 * TODO: Ideally this code shouldn't be smtlib specific (should work
292 * with CVC language too), but this per-language specialization has
293 * been left for a later point.
294 */
295 inline bool isSimpleChar(char ch) {
296 return isalnum(ch) || (strchr("~!@$%^&*_-+=<>.?/", ch) != NULL);
297 }
298
299 /**
300 * Gets part of original input and tries to visually hint where the
301 * error might be.
302 *
303 * Something like:
304 *
305 * ...nd (= alpha beta) (= beta delta))
306 * ^
307 *
308 * Implementation (as on 2014/04/24):
309 *
310 * > if suggested pointer by lexer is under a "simple char", move to
311 * start of the word and print pointer there.
312 *
313 * > in the other case, it tries to find the nearest word in the error
314 * message passed along. if it can't find it, we don't add this
315 * visual hint, as experimentally position suggested by lexer was
316 * found to be totally unhelpful. (TODO: fix this upstream to
317 * improve)
318 */
319 std::string parseErrorHelper(const char* lineStart, int charPositionInLine, const std::string& message)
320 {
321 // Is it a multi-line message
322 bool multilineMessage = (message.find('\n') != string::npos);
323 // Useful only if it is a multi-line message
324 int firstLineEnd = message.find('\n');
325
326 std::ostringstream ss, slicess;
327
328 // Keep first line of message
329 if(multilineMessage) {
330 ss << message.substr(0, firstLineEnd) << endl << endl;
331 } else {
332 ss << message << endl << endl;
333 }
334
335 int posSliceStart = (charPositionInLine - 50 <= 0) ? 0 : charPositionInLine - 50 + 5;
336 int posSliceEnd = posSliceStart + 70;
337 int caretPos = 0;
338 int caretPosExtra = 0; // for inital intendation, epilipses etc.
339
340 ss << " "; caretPosExtra += 2;
341 if(posSliceStart > 0) {
342 ss << "..."; caretPosExtra += 3;
343 }
344
345 for(int i = posSliceStart; lineStart[i] != '\n'; ++i) {
346 if(i == posSliceEnd) {
347 ss << "...";
348 break;
349 }
350 if(i < charPositionInLine) { caretPos++; }
351
352 if(!isprint(lineStart[i])) {
353 // non-printable character, something wrong, bail out
354 return message;
355 }
356
357 ss << (lineStart[i]);
358 slicess << (lineStart[i]);
359 }
360
361 // adjust position of caret, based on slice and message
362 {
363 int caretPosOrig = caretPos;
364 string slice = slicess.str();
365 if(isSimpleChar(slice[caretPos])) {
366 // if alphanumeric, try to go to beginning of word/number
367 while(caretPos > 0 && isSimpleChar(slice[caretPos - 1])) { --caretPos; }
368 if(caretPos == 0 && posSliceStart > 0) {
369 // reached start and this is not really the start? bail out
370 return message;
371 } else {
372 // likely it is also in original message? if so, very likely
373 // we found the right place
374 string word = slice.substr(caretPos, (caretPosOrig - caretPos + 1));
375 unsigned messagePosSt = message.find(word);
376 unsigned messagePosEn = messagePosSt + (caretPosOrig - caretPos);
377 if( messagePosSt < string::npos &&
378 (messagePosSt == 0 || !isSimpleChar(message[messagePosSt-1]) ) &&
379 (messagePosEn+1 == message.size() || !isSimpleChar(message[messagePosEn+1]) ) ) {
380 // ^the complicated if statement is just 'whole-word' match
381 Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl;
382 }
383 }
384 } else {
385 // go to nearest alphanumeric string (before current position),
386 // see if that word can be found in original message. If so,
387 // point to that, else keep pointer where it was.
388 int nearestWordEn = caretPos - 1;
389 while(nearestWordEn > 0 && !isSimpleChar(slice[nearestWordEn])) {
390 --nearestWordEn;
391 }
392 if(isSimpleChar(slice[nearestWordEn])) {
393 int nearestWordSt = nearestWordEn;
394 while(nearestWordSt > 0 && isSimpleChar(slice[nearestWordSt - 1])) {
395 --nearestWordSt;
396 }
397 string word = slice.substr(nearestWordSt, (nearestWordEn - nearestWordSt + 1));
398 Debug("friendlyparser") << "[friendlyparser] nearest word = " << word << std::endl;
399 unsigned messagePosSt = message.find(word);
400 unsigned messagePosEn = messagePosSt + (nearestWordEn - nearestWordSt + 1);
401 if( messagePosSt < string::npos &&
402 (messagePosSt == 0 || !isSimpleChar(message[messagePosSt-1]) ) &&
403 (messagePosEn+1 == message.size() || !isSimpleChar(message[messagePosEn+1]) ) ) {
404 // ^the complicated if statement is just 'whole-word' match
405 Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
406 << nearestWordSt << std::endl;
407 caretPos = nearestWordSt;
408 } else {
409 // this doesn't look good. caret generally getting printed
410 // at unhelpful positions. improve upstream?
411 return message;
412 }
413 }
414 }
415 caretPos += caretPosExtra;
416 }// end of caret position computation/heuristics
417
418 ss << std::endl;
419 while( caretPos-- > 0 ) {
420 ss << ' ';
421 }
422 ss << '^' << endl;
423 if(multilineMessage) {
424 ss << message.substr(firstLineEnd, message.size() - firstLineEnd);;
425 }
426 return ss.str();
427 }
428
429 void AntlrInput::parseError(const std::string& message, bool eofException)
430 throw (ParserException) {
431
432 string updatedMessage = parseErrorHelper((const char*)d_antlr3InputStream->getLineBuf(d_antlr3InputStream),
433 d_lexer->getCharPositionInLine(d_lexer),
434 message);
435
436 Debug("parser") << "Throwing exception: "
437 << (const char*)d_lexer->rec->state->tokSource->fileName->chars << ":"
438 << d_lexer->getLine(d_lexer) << "."
439 << d_lexer->getCharPositionInLine(d_lexer) << ": "
440 << updatedMessage << endl;
441 if(eofException) {
442 throw ParserEndOfFileException(message,
443 (const char*)d_lexer->rec->state->tokSource->fileName->chars,
444 d_lexer->getLine(d_lexer),
445 d_lexer->getCharPositionInLine(d_lexer));
446 } else {
447 throw ParserException(updatedMessage,
448 (const char*)d_lexer->rec->state->tokSource->fileName->chars,
449 d_lexer->getLine(d_lexer),
450 d_lexer->getCharPositionInLine(d_lexer));
451 }
452 }
453
454
455 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer) {
456 d_lexer = pLexer;
457
458 pANTLR3_TOKEN_FACTORY pTokenFactory = d_lexer->rec->state->tokFactory;
459 if( pTokenFactory != NULL ) {
460 pTokenFactory->close(pTokenFactory);
461 }
462
463 /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
464 pTokenFactory = BoundedTokenFactoryNew(d_antlr3InputStream, 2*d_lookahead);
465 if( pTokenFactory == NULL ) {
466 throw InputStreamException("Couldn't create token factory.");
467 }
468 d_lexer->rec->state->tokFactory = pTokenFactory;
469
470 pBOUNDED_TOKEN_BUFFER buffer = BoundedTokenBufferSourceNew(d_lookahead, d_lexer->rec->state->tokSource);
471 if( buffer == NULL ) {
472 throw InputStreamException("Couldn't create token buffer.");
473 }
474
475 d_tokenBuffer = buffer;
476
477 // Override default lexer error reporting
478 d_lexer->rec->reportError = &lexerError;
479 // Override default nextToken function, just to prevent exceptions escaping.
480 d_lexer->rec->state->tokSource->nextToken = &nextToken;
481 }
482
483 void AntlrInput::setParser(Parser& parser) {
484 // ANTLR isn't using super in the lexer or the parser, AFAICT.
485 // We could also use @lexer/parser::context to add a field to the generated
486 // objects, but then it would have to be declared separately in every
487 // language's grammar and we'd have to in the address of the field anyway.
488 d_lexer->super = &parser;
489 d_parser->super = &parser;
490 }
491
492 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser) {
493 d_parser = pParser;
494 // d_parser->rec->match = &match;
495 d_parser->rec->reportError = &reportError;
496 /* Don't try to recover from a parse error. */
497 // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
498 d_parser->rec->recoverFromMismatchedToken =
499 (void* (*)(ANTLR3_BASE_RECOGNIZER_struct*, ANTLR3_UINT32, ANTLR3_BITSET_LIST_struct*))
500 d_parser->rec->mismatch;
501 }
502
503 }/* CVC4::parser namespace */
504 }/* CVC4 namespace */