# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
#          Andrew Dalke / Dalke Scientific Software, LLC
#          30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from openpower.decoder.selectable_int import SelectableInt
from copy import copy
from ply import lex

class SyntaxError2(Exception):
    """ class used to raise a syntax error but get ply to stop eating errors
    since it catches and discards SyntaxError after setting a flag.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                       input_text[line_start:line_end]), cls=cls)
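

# Editor's illustrative sketch (not part of the original file): how the
# helper above turns a lexer position into a (filename, lineno, col, line)
# tuple and re-raises with the requested exception class.  The filename
# "demo.txt" and the input text are made up for the example.
def _demo_raise_syntax_error():
    text = "x <- 1\noops here\n"
    try:
        raise_syntax_error("unknown symbol", "demo.txt", 2,
                           text.index("oops"), text, cls=IndentationError)
    except SyntaxError2 as e:
        print(repr(e))          # repr of the wrapped IndentationError
        e.raise_syntax_error()  # re-raises as IndentationError, chained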


# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check
# to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1   # COLON seen
MUST_INDENT = 2  # COLON NEWLINE seen

# turn pseudo-code syntax into python-like colon syntax:
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
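
# Editor's illustrative sketch (not in the original): for already-colonified
# input, the tagging distinguishes the three INDENT states above, e.g.
#
#   x <- 1          NAME tagged must_indent=False (NO_INDENT)
#   if x: go()      "go" follows COLON only: MAY_INDENT, must_indent=False
#   if x:\n  go()   "go" follows COLON NEWLINE: MUST_INDENT, must_indent=True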


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok


# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)


# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick gets the parser to take the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
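
# Editor's illustrative sketch (not in the original): the rewrite appends
# "fallthrough" to a case line whose following line is another case at the
# same indentation, e.g.
#
#   switch (x):            switch (x):
#       case 1:                case 1: fallthrough
#       case 2:     ->         case 2:
#           y <- 1                 y <- 1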


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:  # set to 1 for debug
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos, token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
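
# Editor's illustrative sketch (not in the original): for the input
#
#   if x:
#       y <- 1
#   z <- 2
#
# the emitted stream is roughly
#   IF NAME COLON NEWLINE INDENT NAME ... NEWLINE DEDENT NAME ... NEWLINE
# i.e. the INDENT/DEDENT pair replaces the raw WS bookkeeping.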


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)


KEYWORD_REPLACEMENTS = {'class': 'class_'}
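
# Editor's note (not in the original): the mapping renames pseudo-code
# identifiers that collide with Python keywords, e.g. in t_NAME below:
#
#   t.value == 'class'  ->  t.value = 'class_'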


class PowerLexer:
    # token list reconstructed from the rules and filters in this file;
    # the original may declare further entries
    tokens = (
        'IF', 'THEN', 'ELSE', 'FOR', 'DO', 'WHILE', 'SWITCH',
        'CASE', 'DEFAULT',
        'HEX',      # hexadecimal
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'NAME', 'ASSIGNEA',
        'COLON', 'LBRACK', 'RBRACK', 'LPAR', 'RPAR',
        'WS', 'NEWLINE', 'INDENT', 'DEDENT', 'ENDMARKER',
    )

    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t
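
    # Editor's illustrative note (not in the original): a hex literal's
    # bit-width is derived from its digit count, e.g.
    #
    #   0x01_FF  ->  SelectableInt(0x1ff, 16)   # "0x01FF": 4 nibbles = 16 bits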

    def t_BINARY(self, t):
        r"""0b[01_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 2), len(val)-2)
        return t
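
    # Editor's illustrative note (not in the original): the width comes from
    # the digit count after "0b", e.g.
    #
    #   0b0011  ->  SelectableInt(3, 4)   # 4 binary digits = 4 bits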

    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        t.value = t.value[1:-1]
        return t

    t_ASSIGNEA = r'<-iea'

    # Ply nicely documented how to do this.

    # entries other than "default" reconstructed from the keyword token
    # types used by the filters above; the original dict may contain more
    RESERVED = {
        "if": "IF", "then": "THEN", "else": "ELSE", "for": "FOR",
        "do": "DO", "while": "WHILE", "switch": "SWITCH",
        "case": "CASE", "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        if t.value in KEYWORD_REPLACEMENTS:
            t.value = KEYWORD_REPLACEMENTS[t.value]
        return t
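
    # Editor's illustrative note (not in the original): reserved words get
    # their own token types, everything else is a NAME, e.g.
    #
    #   "then"   ->  token type THEN
    #   "count"  ->  token type NAME, value "count"
    #   "class"  ->  token type NAME, value "class_" (keyword replacement)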

    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #   a = (1,
    #        2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None
        self.filename = None  # used by filter() and t_error above

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        s += "\n"  # ensure a trailing NEWLINE token
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


# example pseudo-code to lex: a count-trailing-zeros loop
# (only the "if ... then" line survived extraction; the surrounding
# lines are reconstructed to give the lexer a complete sample)
code = """
n  <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RT <- n
"""

if __name__ == '__main__':
    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print(code)
    lexer.input(code)
    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)