1 /********************* */
2 /*! \file antlr_input.cpp
4 ** Original author: Christopher L. Conway
5 ** Major contributors: Morgan Deters, Kshitij Bansal
6 ** Minor contributors (to current version): Francois Bobot
7 ** This file is part of the CVC4 project.
8 ** Copyright (c) 2009-2014 New York University and The University of Iowa
9 ** See the file COPYING in the top-level source directory for licensing
10 ** information.\endverbatim
12 ** \brief A super-class for ANTLR-generated input language parsers.
14 ** A super-class for ANTLR-generated input language parsers
21 #include "base/output.h"
22 #include "expr/type.h"
23 #include "parser/antlr_input.h"
24 #include "parser/antlr_line_buffered_input.h"
25 #include "parser/bounded_token_buffer.h"
26 #include "parser/bounded_token_factory.h"
27 #include "parser/cvc/cvc_input.h"
28 #include "parser/input.h"
29 #include "parser/memory_mapped_input_buffer.h"
30 #include "parser/parser.h"
31 #include "parser/parser_exception.h"
32 #include "parser/smt1/smt1_input.h"
33 #include "parser/smt2/smt2_input.h"
34 #include "parser/smt2/sygus_input.h"
35 #include "parser/tptp/tptp_input.h"
36 #include "smt_util/command.h"
40 using namespace CVC4::parser
;
41 using namespace CVC4::kind
;
46 // These functions exactly wrap the antlr3 source inconsistencies.
47 // These are the only location CVC4_ANTLR3_OLD_INPUT_STREAM ifdefs appear.
48 // No other sanity checking happens;
49 pANTLR3_INPUT_STREAM
newAntlr3BufferedStream(std::istream
& input
, const std::string
& name
);
50 pANTLR3_INPUT_STREAM
newAntlr3FileStream(const std::string
& name
);
51 pANTLR3_INPUT_STREAM
newAntrl3InPlaceStream(pANTLR3_UINT8 basep
,
53 const std::string
& name
);
56 newAntlr3BufferedStream(std::istream
& input
, const std::string
& name
){
57 pANTLR3_INPUT_STREAM inputStream
= NULL
;
58 pANTLR3_UINT8 name_duplicate
= (pANTLR3_UINT8
) strdup(name
.c_str());
60 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
62 antlr3LineBufferedStreamNew(input
, 0, name_duplicate
);
63 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
65 antlr3LineBufferedStreamNew(input
, ANTLR3_ENC_8BIT
, name_duplicate
);
66 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
72 pANTLR3_INPUT_STREAM
newAntlr3FileStream(const std::string
& name
){
73 pANTLR3_INPUT_STREAM input
= NULL
;
74 pANTLR3_UINT8 name_duplicate
= (pANTLR3_UINT8
) strdup(name
.c_str());
76 // libantlr3c v3.2 isn't source-compatible with v3.4
77 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
78 input
= antlr3AsciiFileStreamNew(name_duplicate
);
79 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
80 input
= antlr3FileStreamNew(name_duplicate
, ANTLR3_ENC_8BIT
);
81 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
88 pANTLR3_INPUT_STREAM
newAntrl3InPlaceStream(pANTLR3_UINT8 basep
,
90 const std::string
& name
){
91 pANTLR3_UINT8 name_duplicate
= (pANTLR3_UINT8
) strdup(name
.c_str());
92 pANTLR3_INPUT_STREAM inputStream
= NULL
;
93 /* Create an ANTLR input backed by the buffer. */
94 #ifdef CVC4_ANTLR3_OLD_INPUT_STREAM
96 antlr3NewAsciiStringInPlaceStream(basep
, size
, name_duplicate
);
97 #else /* CVC4_ANTLR3_OLD_INPUT_STREAM */
99 antlr3StringStreamNew(basep
, ANTLR3_ENC_8BIT
, size
,
101 #endif /* CVC4_ANTLR3_OLD_INPUT_STREAM */
102 free(name_duplicate
);
106 AntlrInputStream::AntlrInputStream(std::string name
,
107 pANTLR3_INPUT_STREAM input
,
108 bool fileIsTemporary
,
109 pANTLR3_UINT8 inputString
) :
110 InputStream(name
, fileIsTemporary
),
112 d_inputString(inputString
)
114 assert( input
!= NULL
);
115 input
->fileName
= input
->strFactory
->newStr8(input
->strFactory
, (pANTLR3_UINT8
)name
.c_str());
118 AntlrInputStream::~AntlrInputStream() {
119 d_input
->free(d_input
);
120 if(d_inputString
!= NULL
){
125 pANTLR3_INPUT_STREAM
AntlrInputStream::getAntlr3InputStream() const {
132 AntlrInputStream::newFileInputStream(const std::string
& name
,
134 throw (InputStreamException
) {
140 pANTLR3_INPUT_STREAM input
= NULL
;
142 input
= MemoryMappedInputBufferNew(name
);
144 input
= newAntlr3FileStream(name
);
147 throw InputStreamException("Couldn't open file: " + name
);
149 return new AntlrInputStream( name
, input
, false, NULL
);
154 AntlrInputStream::newStreamInputStream(std::istream
& input
,
155 const std::string
& name
,
157 throw (InputStreamException
) {
159 pANTLR3_INPUT_STREAM inputStream
= NULL
;
160 pANTLR3_UINT8 inputStringCopy
= NULL
;
163 inputStream
= newAntlr3BufferedStream(input
, name
);
166 // Since these are all NULL on entry, realloc will be called
167 char *basep
= NULL
, *boundp
= NULL
, *cp
= NULL
;
168 /* 64KB seems like a reasonable default size. */
169 size_t bufSize
= 0x10000;
171 /* Keep going until we can't go no more. */
172 while( !input
.eof() && !input
.fail() ) {
175 /* We ran out of room in the buffer. Realloc at double the size. */
176 ptrdiff_t offset
= cp
- basep
;
177 basep
= (char *) realloc(basep
, bufSize
);
178 if( basep
== NULL
) {
179 throw InputStreamException("Failed buffering input stream: " + name
);
182 boundp
= basep
+ bufSize
;
186 /* Read as much as we have room for. */
187 input
.read( cp
, boundp
- cp
);
188 cp
+= input
.gcount();
191 /* Make sure the fail bit didn't get set. */
193 throw InputStreamException("Stream input failed: " + name
);
195 ptrdiff_t offset
= cp
- basep
;
197 assert(offset
<= std::numeric_limits
<uint32_t>::max());
198 inputStringCopy
= (pANTLR3_UINT8
)basep
;
199 inputStream
= newAntrl3InPlaceStream(inputStringCopy
, (uint32_t) offset
, name
);
202 if( inputStream
== NULL
) {
203 throw InputStreamException("Couldn't initialize input: " + name
);
206 return new AntlrInputStream( name
, inputStream
, false, inputStringCopy
);
211 AntlrInputStream::newStringInputStream(const std::string
& input
,
212 const std::string
& name
)
213 throw (InputStreamException
) {
215 size_t input_size
= input
.size();
216 assert(input_size
<= std::numeric_limits
<uint32_t>::max());
218 // Ownership of input_duplicate is transferred to the AntlrInputStream.
219 pANTLR3_UINT8 input_duplicate
= (pANTLR3_UINT8
) strdup(input
.c_str());
221 if( input_duplicate
== NULL
) {
222 throw InputStreamException("Couldn't initialize string input: '" + input
+ "'");
225 pANTLR3_INPUT_STREAM inputStream
= newAntrl3InPlaceStream(input_duplicate
, (uint32_t)input_size
, name
);
226 if( inputStream
==NULL
) {
227 throw InputStreamException("Couldn't initialize string input: '" + input
+ "'");
229 return new AntlrInputStream( name
, inputStream
, false, input_duplicate
);
232 AntlrInput
* AntlrInput::newInput(InputLanguage lang
, AntlrInputStream
& inputStream
) {
233 using namespace language::input
;
239 input
= new CvcInput(inputStream
);
243 input
= new Smt1Input(inputStream
);
246 case LANG_SMTLIB_V2_0
:
247 case LANG_SMTLIB_V2_5
:
248 input
= new Smt2Input(inputStream
, lang
);
252 input
= new SygusInput(inputStream
);
256 input
= new TptpInput(inputStream
);
260 std::stringstream ss
;
261 ss
<< "internal error: unhandled language " << lang
<< " in AntlrInput::newInput";
262 throw InputStreamException(ss
.str());
268 AntlrInput::AntlrInput(AntlrInputStream
& inputStream
, unsigned int lookahead
) :
270 d_lookahead(lookahead
),
273 d_antlr3InputStream( inputStream
.getAntlr3InputStream() ),
274 d_tokenBuffer(NULL
) {
278 AntlrParser::AntlrParser(ExprManager* exprManager, std::istream& input, const std::string& name, unsigned int lookahead)
279 Parser(exprManager,name),
280 d_lookahead(lookahead) {
286 AntlrInput::Input(ExprManager* exprManager, const std::string& input, const std::string& name, unsigned int lookahead) :
287 Input(exprManager,name),
288 d_lookahead(lookahead),
291 d_tokenStream(NULL) {
293 char* inputStr = strdup(input.c_str());
294 char* nameStr = strdup(name.c_str());
295 if( inputStr==NULL || nameStr==NULL ) {
296 throw ParserException("Couldn't initialize string input: '" + input + "'");
298 d_inputStream = antlr3NewAsciiStringInPlaceStream((pANTLR3_UINT8)inputStr,input.size(),(pANTLR3_UINT8)nameStr);
299 if( d_inputStream == NULL ) {
300 throw ParserException("Couldn't create input stream for string: '" + input + "'");
306 AntlrInput::~AntlrInput() {
307 BoundedTokenBufferFree(d_tokenBuffer
);
310 pANTLR3_COMMON_TOKEN_STREAM
AntlrInput::getTokenStream() {
311 return d_tokenBuffer
->commonTstream
;
314 void AntlrInput::lexerError(pANTLR3_BASE_RECOGNIZER recognizer
) {
315 pANTLR3_LEXER lexer
= (pANTLR3_LEXER
)(recognizer
->super
);
317 Parser
*parser
= (Parser
*)(lexer
->super
);
318 assert(parser
!=NULL
);
319 AntlrInput
*input
= (AntlrInput
*) parser
->getInput();
322 /* Call the error display routine *if* there's not already a
323 * parse error pending. If a parser error is pending, this
324 * error is probably less important, so we just drop it. */
325 if( input
->d_parser
->rec
->state
->error
== ANTLR3_FALSE
) {
326 input
->parseError("Error finding next token.");
330 void AntlrInput::warning(const std::string
& message
) {
331 Warning() << getInputStream()->getName() << ':' << d_lexer
->getLine(d_lexer
) << '.' << d_lexer
->getCharPositionInLine(d_lexer
) << ": " << message
<< endl
;
337 * characters considered part of a simple symbol in SMTLIB.
339 * TODO: Ideally this code shouldn't be smtlib specific (should work
340 * with CVC language too), but this per-language specialization has
341 * been left for a later point.
343 inline bool isSimpleChar(char ch
) {
344 return isalnum(ch
) || (strchr("~!@$%^&*_-+=<>.?/", ch
) != NULL
);
347 size_t wholeWordMatch(string input
, string pattern
, bool (*isWordChar
)(char)) {
349 size_t N
= input
.size();
351 while( st
< N
&& (*isWordChar
)(input
[st
]) == false ) st
++;
353 while(en
+ 1 < N
&& (*isWordChar
)(input
[en
+ 1]) == true) en
++;
354 if(en
- st
+ 1 == pattern
.size()) {
356 for(size_t i
= 0; match
&& i
< pattern
.size(); ++i
) {
357 match
&= (pattern
[i
] == input
[st
+i
]);
369 * Gets part of original input and tries to visually hint where the
374 * ...nd (= alpha beta) (= beta delta))
377 * Implementation (as on 2014/04/24):
379 * > if suggested pointer by lexer is under a "simple char", move to
380 * start of the word and print pointer there.
382 * > in the other case, it tries to find the nearest word in the error
383 * message passed along. if it can't find it, we don't add this
384 * visual hint, as experimentally position suggested by lexer was
385 * found to be totally unhelpful. (TODO: fix this upstream to
388 std::string
parseErrorHelper(const char* lineStart
, int charPositionInLine
, const std::string
& message
)
390 // Is it a multi-line message
391 bool multilineMessage
= (message
.find('\n') != string::npos
);
392 // Useful only if it is a multi-line message
393 int firstLineEnd
= message
.find('\n');
395 std::ostringstream ss
, slicess
;
397 // Keep first line of message
398 if(multilineMessage
) {
399 ss
<< message
.substr(0, firstLineEnd
) << endl
<< endl
;
401 ss
<< message
<< endl
<< endl
;
404 int posSliceStart
= (charPositionInLine
- 50 <= 0) ? 0 : charPositionInLine
- 50 + 5;
405 int posSliceEnd
= posSliceStart
+ 70;
407 int caretPosExtra
= 0; // for inital intendation, epilipses etc.
409 ss
<< " "; caretPosExtra
+= 2;
410 if(posSliceStart
> 0) {
411 ss
<< "..."; caretPosExtra
+= 3;
414 for(int i
= posSliceStart
; lineStart
[i
] != '\n'; ++i
) {
415 if(i
== posSliceEnd
) {
419 if(i
< charPositionInLine
) { caretPos
++; }
421 if(!isprint(lineStart
[i
])) {
422 // non-printable character, something wrong, bail out
426 ss
<< (lineStart
[i
]);
427 slicess
<< (lineStart
[i
]);
430 // adjust position of caret, based on slice and message
432 int caretPosOrig
= caretPos
;
433 string slice
= slicess
.str();
434 if(isSimpleChar(slice
[caretPos
])) {
435 // if alphanumeric, try to go to beginning of word/number
436 while(caretPos
> 0 && isSimpleChar(slice
[caretPos
- 1])) { --caretPos
; }
437 if(caretPos
== 0 && posSliceStart
> 0) {
438 // reached start and this is not really the start? bail out
441 // likely it is also in original message? if so, very likely
442 // we found the right place
443 string word
= slice
.substr(caretPos
, (caretPosOrig
- caretPos
+ 1));
444 size_t matchLoc
= wholeWordMatch(message
, word
, isSimpleChar
);
445 Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc
<< endl
;
446 if( matchLoc
!= string::npos
) {
447 Debug("friendlyparser") << "[friendlyparser] Feeling good." << std::endl
;
451 bool foundCaretPos
= false;
453 for(int tries
= 0; tries
< 2 && caretPos
> 0 && !foundCaretPos
; ++tries
) {
454 // go to nearest alphanumeric string (before current position),
455 // see if that word can be found in original message. If so,
456 // point to that, else keep pointer where it was.
457 int nearestWordEn
= caretPos
- 1;
458 while(nearestWordEn
> 0 && !isSimpleChar(slice
[nearestWordEn
])) {
461 if(isSimpleChar(slice
[nearestWordEn
])) {
462 int nearestWordSt
= nearestWordEn
;
463 while(nearestWordSt
> 0 && isSimpleChar(slice
[nearestWordSt
- 1])) {
466 string word
= slice
.substr(nearestWordSt
, (nearestWordEn
- nearestWordSt
+ 1));
467 size_t matchLoc
= wholeWordMatch(message
, word
, isSimpleChar
);
468 Debug("friendlyparser") << "[friendlyparser] nearest word = " << word
<< std::endl
;
469 Debug("friendlyparser") << "[friendlyparser] matchLoc = " << matchLoc
<< endl
;
470 if( matchLoc
!= string::npos
) {
471 Debug("friendlyparser") << "[friendlyparser] strong evidence that caret should be at "
472 << nearestWordSt
<< std::endl
;
473 foundCaretPos
= true;
475 caretPos
= nearestWordSt
;
478 if( !foundCaretPos
) {
479 // this doesn't look good. caret generally getting printed
480 // at unhelpful positions. improve upstream?
484 caretPos
+= caretPosExtra
;
485 }// end of caret position computation/heuristics
488 while( caretPos
-- > 0 ) {
492 if(multilineMessage
) {
493 ss
<< message
.substr(firstLineEnd
, message
.size() - firstLineEnd
);;
498 void AntlrInput::parseError(const std::string
& message
, bool eofException
)
499 throw (ParserException
) {
501 string updatedMessage
= parseErrorHelper((const char*)d_antlr3InputStream
->getLineBuf(d_antlr3InputStream
),
502 d_lexer
->getCharPositionInLine(d_lexer
),
505 Debug("parser") << "Throwing exception: "
506 << (const char*)d_lexer
->rec
->state
->tokSource
->fileName
->chars
<< ":"
507 << d_lexer
->getLine(d_lexer
) << "."
508 << d_lexer
->getCharPositionInLine(d_lexer
) << ": "
509 << updatedMessage
<< endl
;
511 throw ParserEndOfFileException(message
,
512 (const char*)d_lexer
->rec
->state
->tokSource
->fileName
->chars
,
513 d_lexer
->getLine(d_lexer
),
514 d_lexer
->getCharPositionInLine(d_lexer
));
516 throw ParserException(updatedMessage
,
517 (const char*)d_lexer
->rec
->state
->tokSource
->fileName
->chars
,
518 d_lexer
->getLine(d_lexer
),
519 d_lexer
->getCharPositionInLine(d_lexer
));
524 void AntlrInput::setAntlr3Lexer(pANTLR3_LEXER pLexer
) {
527 pANTLR3_TOKEN_FACTORY pTokenFactory
= d_lexer
->rec
->state
->tokFactory
;
528 if( pTokenFactory
!= NULL
) {
529 pTokenFactory
->close(pTokenFactory
);
532 /* 2*lookahead should be sufficient, but we give ourselves some breathing room. */
533 pTokenFactory
= BoundedTokenFactoryNew(d_antlr3InputStream
, 2*d_lookahead
);
534 if( pTokenFactory
== NULL
) {
535 throw InputStreamException("Couldn't create token factory.");
537 d_lexer
->rec
->state
->tokFactory
= pTokenFactory
;
539 pBOUNDED_TOKEN_BUFFER buffer
= BoundedTokenBufferSourceNew(d_lookahead
, d_lexer
->rec
->state
->tokSource
);
540 if( buffer
== NULL
) {
541 throw InputStreamException("Couldn't create token buffer.");
544 d_tokenBuffer
= buffer
;
546 // Override default lexer error reporting
547 d_lexer
->rec
->reportError
= &lexerError
;
548 // Override default nextToken function, just to prevent exceptions escaping.
549 d_lexer
->rec
->state
->tokSource
->nextToken
= &nextToken
;
552 void AntlrInput::setParser(Parser
& parser
) {
553 // ANTLR isn't using super in the lexer or the parser, AFAICT.
554 // We could also use @lexer/parser::context to add a field to the generated
555 // objects, but then it would have to be declared separately in every
556 // language's grammar and we'd have to in the address of the field anyway.
557 d_lexer
->super
= &parser
;
558 d_parser
->super
= &parser
;
561 void AntlrInput::setAntlr3Parser(pANTLR3_PARSER pParser
) {
563 // d_parser->rec->match = &match;
564 d_parser
->rec
->reportError
= &reportError
;
565 /* Don't try to recover from a parse error. */
566 // [chris 4/5/2010] Not clear on why this cast is necessary, but I get an error if I remove it.
567 d_parser
->rec
->recoverFromMismatchedToken
=
568 (void* (*)(ANTLR3_BASE_RECOGNIZER_struct
*, ANTLR3_UINT32
, ANTLR3_BITSET_LIST_struct
*))
569 d_parser
->rec
->mismatch
;
572 }/* CVC4::parser namespace */
573 }/* CVC4 namespace */