# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy

from ply import lex

import openpower.oppc.pc_ast as pc_ast
def bind(cls):
    """Build a ply token-rule method that wraps the matched text in *cls*.

    NOTE(review): the ``def`` lines and ``return`` statements of this
    factory were missing from the mangled fragment; only the wrapper body
    and the __name__/__doc__ copying were visible.  The name ``bind`` is
    inferred - confirm against the original source.
    """
    def wrapper(self, t):
        # replace the raw matched string with the AST node class instance
        t.value = cls(t.value)
        return t
    # ply keys rule names and regexes off __name__/__doc__, so propagate
    # them from the AST class onto the generated rule
    wrapper.__name__ = cls.__name__
    wrapper.__doc__ = cls.__doc__
    return wrapper
class SyntaxError2(Exception):
    """ class used to raise a syntax error but get ply to stop eating errors
    since it catches and discards SyntaxError after setting a flag.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        # remember the "real" exception class so callers can re-raise
        # (or stringify) as that class later
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        # re-raise as the originally-requested class, chaining this
        # instance as the cause for full traceback context
        raise self.cls(*self.args) from self
def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    """Raise SyntaxError2 with standard SyntaxError argument layout:
    (msg, (filename, lineno, col, source_line)).

    NOTE(review): the ``cls=SyntaxError`` trailing parameter was not
    visible in the fragment; it is inferred from the ``cls=cls``
    forwarding below - confirm against the original.
    """
    line_start = input_text.rfind("\n", 0, lexpos) + 1
    line_end = input_text.find("\n", line_start)
    # fix: find() returns -1 when the offending line is the last line and
    # has no trailing newline, which silently dropped its last character
    if line_end < 0:
        line_end = len(input_text)
    # column is 1-based, measured from the start of the offending line
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]),
                       cls=cls)
# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check to
# see if the new line has changed indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent

# turn into python-like colon syntax from pseudo-code syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
def python_colonify(lexer, tokens):
    """Generator filter: inject python-style COLON tokens into the
    pseudo-code token stream so the indentation machinery works unchanged.

    NOTE(review): reconstructed from a mangled fragment - the yields, the
    ``t.type`` assignments and the token copying for the ELSE/NEWLINE
    cases were not visible and are inferred; confirm against the original.
    """
    implied_colon_needed = False
    for token in tokens:
        if token.type == "THEN":
            # turn then into colon
            token.type = "COLON"
            token.value = pc_ast.Colon(str(token.value))
            yield token
        elif token.type == "ELSE":
            # "else" keeps its own token but must be followed by a colon
            yield token
            token = copy(token)
            token.type = "COLON"
            token.value = pc_ast.Colon(str(token.value))
            yield token
        elif token.type in ["DO", "WHILE", "FOR", "SWITCH"]:
            # these statement keywords imply a colon at end of line
            implied_colon_needed = True
            yield token
        elif token.type == "NEWLINE":
            if implied_colon_needed:
                # inject the implied colon just before the newline
                ctok = copy(token)
                ctok.type = "COLON"
                ctok.value = pc_ast.Colon(str(token.value))
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    """Tag each token with ``at_line_start`` and ``must_indent`` so the
    indentation filter can detect block structure without grammar rules.

    NOTE(review): reconstructed from a mangled fragment.  The constants
    NO_INDENT / MAY_INDENT / MUST_INDENT are module-level values defined
    outside the visible fragment; the NEWLINE/else bookkeeping lines are
    inferred - confirm against the original.
    """
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False
        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False
        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False
        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ("ignore", " ")
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
def _new_token(type, lineno):
    """Create a synthetic ply LexToken of *type* at *lineno*.

    NOTE(review): only the type -> AST-class mapping was visible in the
    mangled fragment; the LexToken construction (value/lexpos fields) is
    inferred from standard ply usage - confirm against the original.
    """
    cls = {
        "ENDMARKER": pc_ast.Endmarker,
        "INDENT": pc_ast.Indent,
        "DEDENT": pc_ast.Dedent,
    }[type]
    tok = lex.LexToken()
    tok.type = type
    tok.value = cls()
    tok.lineno = lineno
    tok.lexpos = -1
    return tok
def DEDENT(lineno):
    # Synthesize a DEDENT tag
    return _new_token("DEDENT", lineno)
def INDENT(lineno):
    # Synthesize an INDENT tag
    return _new_token("INDENT", lineno)
def count_spaces(l):
    """Return the number of leading spaces in *l*.

    Returns 0 for an empty string and for an all-space string (the loop
    falls through without finding a non-space character).

    NOTE(review): only the ``for`` line was visible in the mangled
    fragment; the name comes from the call site in
    annoying_case_hack_filter, and the body/fall-through value are
    inferred - confirm against the original.
    """
    for i in range(len(l)):
        if l[i] != " ":
            return i
    return 0
def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this which tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append("")
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            # a case/default at the same depth as the previous case-like
            # line marks that previous line as falling through
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            # any other line breaks a case/case adjacency
            prev_spc_count = None
        res.append(l)
    return "\n".join(res)
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens, filename):
    """Generator filter: convert WS/must_indent bookkeeping into explicit
    INDENT / DEDENT tokens, python-style.

    NOTE(review): reconstructed from a mangled fragment - the level-stack
    push/pop statements and the try/except around ``levels.index`` are
    inferred from the visible comparisons; confirm against the original.
    """
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            levels.append(depth)
            yield INDENT(token.lineno)
        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos,
                                       token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    """Chain all token filters together over the raw ply token stream,
    optionally appending a final ENDMARKER token.

    NOTE(review): shadows the ``filter`` builtin; name kept for
    compatibility with existing callers.
    """
    token = None
    # iter(callable, sentinel): pull tokens until lexer.token() returns None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
335 "NUMBER", # Python decimals
336 "BINARY", # Python binary
337 "STRING", # single quoted strings only; syntax of raw strings
381 def build(self
, **kwargs
):
382 self
.lexer
= lex
.lex(module
=self
, **kwargs
)
385 @lex.TOKEN(pc_ast
.HexLiteral
.__doc
__)
387 t
.value
= pc_ast
.HexLiteral(t
.value
)
390 @lex.TOKEN(pc_ast
.BinLiteral
.__doc
__)
391 def t_BINARY(self
, t
):
392 t
.value
= pc_ast
.BinLiteral(t
.value
)
395 @lex.TOKEN(pc_ast
.DecLiteral
.__doc
__)
396 def t_NUMBER(self
, t
):
397 t
.value
= pc_ast
.DecLiteral(t
.value
)
400 @lex.TOKEN(pc_ast
.StringLiteral
.__doc
__)
401 def t_STRING(self
, t
):
402 t
.value
= pc_ast
.StringLiteral(t
.value
[1:-1])
405 t_COLON
= pc_ast
.Colon
.__doc
__
406 t_EQ
= pc_ast
.Eq
.__doc
__
407 t_ASSIGNEA
= pc_ast
.AssignIEAOp
.__doc
__
408 t_ASSIGN
= pc_ast
.AssignOp
.__doc
__
409 t_LTU
= pc_ast
.LtU
.__doc
__
410 t_GTU
= pc_ast
.GtU
.__doc
__
411 t_NE
= pc_ast
.NotEq
.__doc
__
412 t_LE
= pc_ast
.Le
.__doc
__
413 t_GE
= pc_ast
.Ge
.__doc
__
414 t_LSHIFT
= pc_ast
.LShift
.__doc
__
415 t_RSHIFT
= pc_ast
.RShift
.__doc
__
416 t_LT
= pc_ast
.Lt
.__doc
__
417 t_GT
= pc_ast
.Gt
.__doc
__
418 t_PLUS
= pc_ast
.Add
.__doc
__
419 t_MINUS
= pc_ast
.Sub
.__doc
__
420 t_MULT
= pc_ast
.Mul
.__doc
__
421 t_DIV
= pc_ast
.Div
.__doc
__
422 t_MOD
= pc_ast
.Mod
.__doc
__
423 t_INVERT
= pc_ast
.Not
.__doc
__
424 t_COMMA
= pc_ast
.Comma
.__doc
__
425 t_PERIOD
= pc_ast
.Period
.__doc
__
426 t_SEMICOLON
= pc_ast
.Semicolon
.__doc
__
427 t_APPEND
= pc_ast
.BitConcat
.__doc
__
428 t_BITOR
= pc_ast
.BitOr
.__doc
__
429 t_BITAND
= pc_ast
.BitAnd
.__doc
__
430 t_BITXOR
= pc_ast
.BitXor
.__doc
__
431 t_QMARK
= pc_ast
.Question
.__doc
__
433 @lex.TOKEN(pc_ast
.Symbol
)
436 "def": ("DEF", pc_ast
.FunctionKeyword
),
437 "if": ("IF", pc_ast
.IfKeyword
),
438 "then": ("THEN", pc_ast
.ThenKeyword
),
439 "else": ("ELSE", pc_ast
.ElseKeyword
),
440 "leave": ("BREAK", pc_ast
.LeaveKeyword
),
441 "for": ("FOR", pc_ast
.ForKeyword
),
442 "to": ("TO", pc_ast
.ToKeyword
),
443 "while": ("WHILE", pc_ast
.WhileKeyword
),
444 "do": ("DO", pc_ast
.DoKeyword
),
445 "return": ("RETURN", pc_ast
.ReturnKeyword
),
446 "switch": ("SWITCH", pc_ast
.SwitchKeyword
),
447 "case": ("CASE", pc_ast
.CaseKeyword
),
448 "default": ("DEFAULT", pc_ast
.DefaultKeyword
),
450 (tt
, tcls
) = keywords
.get(t
.value
, ("NAME", pc_ast
.Symbol
))
452 t
.value
= tcls(t
.value
)
455 # Putting this before t_WS let it consume lines with only comments in
456 # them so the latter code never sees the WS part. Not consuming the
457 # newline. Needed for "if 1: #comment"
458 @lex.TOKEN(pc_ast
.Comment
.__doc
__)
459 def t_comment(self
, t
):
463 @lex.TOKEN(pc_ast
.Whitespace
.__doc
__)
465 if (t
.lexer
.at_line_start
and
466 t
.lexer
.paren_count
== 0 and \
467 t
.lexer
.brack_count
== 0):
470 # Don't generate newline tokens when inside of parenthesis, eg
473 @lex.TOKEN(pc_ast
.Linebreak
.__doc
__)
474 def t_newline(self
, t
):
475 t
.lexer
.lineno
+= len(t
.value
)
476 t
.value
= pc_ast
.Linebreak(t
.value
)
478 if t
.lexer
.paren_count
== 0 and t
.lexer
.brack_count
== 0:
481 @lex.TOKEN(pc_ast
.LBracket
.__doc
__)
482 def t_LBRACK(self
, t
):
483 t
.lexer
.brack_count
+= 1
484 t
.value
= pc_ast
.LBracket(t
.value
)
487 @lex.TOKEN(pc_ast
.RBracket
.__doc
__)
488 def t_RBRACK(self
, t
):
489 t
.lexer
.brack_count
-= 1
490 t
.value
= pc_ast
.RBracket(t
.value
)
493 @lex.TOKEN(pc_ast
.LParenthesis
.__doc
__)
495 t
.lexer
.paren_count
+= 1
496 t
.value
= pc_ast
.LParenthesis(t
.value
)
499 @lex.TOKEN(pc_ast
.RParenthesis
.__doc
__)
501 t
.lexer
.paren_count
-= 1
502 t
.value
= pc_ast
.RParenthesis(t
.value
)
505 def t_error(self
, t
):
506 raise_syntax_error("Unknown symbol %r" % (t
.value
[0],),
507 self
.filename
, t
.lexer
.lineno
,
508 t
.lexer
.lexpos
, t
.lexer
.lexdata
)
# Combine Ply and my filters into a new lexer

class IndentLexer(Lexer):
    def __init__(self, debug=False, optimize=False, lextab="lextab"):
        # NOTE(review): one original line of this constructor (original
        # line 516) is missing from the fragment; only the visible
        # statements are reproduced here - confirm against the original.
        self.build(debug=debug, optimize=optimize, lextab=lextab)
        # the filtered token generator; created lazily by input()
        self.token_stream = None
520 def input(self
, s
, add_endmarker
=True):
521 s
= annoying_case_hack_filter(s
)
523 self
.lexer
.paren_count
= 0
524 self
.lexer
.brack_count
= 0
525 self
.lexer
.lineno
= 1
527 self
.token_stream
= filter(self
.lexer
, add_endmarker
, self
.filename
)
530 # The simplest way to convert "simple" tokens to classes.
531 # Functions won't work due to ply reliability on __code__.
532 # We end up with (LT+MINUS) instead of ASSIGN otherwise.
534 "COLON": pc_ast
.Colon
,
536 "ASSIGNEA": pc_ast
.AssignIEAOp
,
537 "ASSIGN": pc_ast
.AssignOp
,
543 "LSHIFT": pc_ast
.LShift
,
544 "RSHIFT": pc_ast
.RShift
,
552 "INVERT": pc_ast
.Not
,
553 "COMMA": pc_ast
.Comma
,
554 "PERIOD": pc_ast
.Period
,
555 "SEMICOLON": pc_ast
.Semicolon
,
556 "APPEND": pc_ast
.BitConcat
,
557 "BITOR": pc_ast
.BitOr
,
558 "BITAND": pc_ast
.BitAnd
,
559 "BITXOR": pc_ast
.BitXor
,
560 "QMARK": pc_ast
.Question
,
563 t
= next(self
.token_stream
)
565 if t
.type in mapping
:
566 t
.value
= mapping
[t
.type](t
.value
)
568 except StopIteration: