1 # Based on GardenSnake - a parser generator demonstration program
2 # GardenSnake was released into the Public Domain by Andrew Dalke.
4 # Portions of this work are derived from Python's Grammar definition
5 # and may be covered under the Python copyright and license
7 # Andrew Dalke / Dalke Scientific Software, LLC
8 # 30 August 2006 / Cape Town, South Africa
10 # Modifications for inclusion in PLY distribution
from soc.decoder.selectable_int import SelectableInt
15 ## I implemented INDENT / DEDENT generation as a post-processing filter
17 # The original lex token stream contains WS and NEWLINE characters.
18 # WS will only occur before any other tokens on a line.
20 # I have three filters. One tags tokens by adding two attributes.
21 # "must_indent" is True if the token must be indented from the
22 # previous code. The other is "at_line_start" which is True for WS
23 # and the first non-WS/non-NEWLINE on a line.  It flags the check to
24 # see if the new line has changed indentation level.
26 # Python's syntax has three INDENT states
27 # 0) no colon hence no need to indent
28 # 1) "if 1: go()" - simple statements have a COLON but no need for an indent
29 # 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
34 # turn into python-like colon syntax from pseudo-code syntax.
35 # identify tokens which tell us whether a "hidden colon" is needed.
36 # this in turn means that track_tokens_filter "works" without needing
37 # complex grammar rules
38 def python_colonify(lexer
, tokens
):
40 implied_colon_needed
= False
42 #print ("track colon token", token, token.type)
44 if token
.type == 'THEN':
45 # turn then into colon
48 elif token
.type == 'ELSE':
53 elif token
.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
54 implied_colon_needed
= True
56 elif token
.type == 'NEWLINE':
57 if implied_colon_needed
:
61 implied_colon_needed
= False
67 # only care about whitespace at the start of a line
68 def track_tokens_filter(lexer
, tokens
):
69 oldignore
= lexer
.lexignore
70 lexer
.at_line_start
= at_line_start
= True
74 #print ("track token", token, token.type)
75 token
.at_line_start
= at_line_start
77 if token
.type == "COLON":
80 token
.must_indent
= False
82 elif token
.type == "NEWLINE":
84 if indent
== MAY_INDENT
:
86 token
.must_indent
= False
88 elif token
.type == "WS":
89 assert token
.at_line_start
== True
91 token
.must_indent
= False
94 # A real token; only indent after COLON NEWLINE
95 if indent
== MUST_INDENT
:
96 token
.must_indent
= True
98 token
.must_indent
= False
102 # really bad hack that changes ignore lexer state.
103 # when "must indent" is seen (basically "real tokens" seen)
104 # then ignore whitespace.
105 if token
.must_indent
:
106 lexer
.lexignore
= ('ignore', ' ')
108 lexer
.lexignore
= oldignore
110 token
.indent
= indent
112 lexer
.at_line_start
= at_line_start
114 def _new_token(type, lineno
):
122 # Synthesize a DEDENT tag
124 return _new_token("DEDENT", lineno
)
126 # Synthesize an INDENT tag
128 return _new_token("INDENT", lineno
)
131 for i
in range(len(l
)):
136 def annoying_case_hack_filter(code
):
137 """add annoying "silent keyword" (fallthrough)
139 this which tricks the parser into taking the (silent) case statement
140 as a "small expression". it can then be spotted and used to indicate
141 "fall through" to the next case (in the parser)
143 bugs: any function that starts with the letters "case" or "default"
144 will be detected erroneously. fixing that involves doing a token
145 lexer which spots the fact that "case" and "default" are words,
146 separating them from space, colon, bracket etc.
148 http://bugs.libre-riscv.org/show_bug.cgi?id=280
151 prev_spc_count
= None
152 for l
in code
.split("\n"):
153 spc_count
= count_spaces(l
)
154 nwhite
= l
[spc_count
:]
155 if nwhite
.startswith("case") or nwhite
.startswith("default"):
156 #print ("case/default", nwhite, spc_count, prev_spc_count)
157 if (prev_spc_count
is not None and
158 prev_spc_count
== spc_count
and
159 (res
[-1].endswith(":") or res
[-1].endswith(": fallthrough"))):
160 res
[-1] += " fallthrough" # add to previous line
161 prev_spc_count
= spc_count
163 #print ("notstarts", spc_count, nwhite)
164 prev_spc_count
= None
166 return '\n'.join(res
)
169 # Track the indentation level and emit the right INDENT / DEDENT events.
170 def indentation_filter(tokens
):
171 # A stack of indentation levels; will never pop item 0
178 print ("Process", depth
, token
.indent
, token
,)
179 if token
.at_line_start
:
180 print ("at_line_start",)
181 if token
.must_indent
:
182 print ("must_indent",)
185 # WS only occurs at the start of the line
186 # There may be WS followed by NEWLINE so
187 # only track the depth here. Don't indent/dedent
188 # until there's something real.
189 if token
.type == "WS":
191 depth
= len(token
.value
)
193 # WS tokens are never passed to the parser
196 if token
.type == "NEWLINE":
198 if prev_was_ws
or token
.at_line_start
:
201 # pass the other cases on through
205 # then it must be a real token (not WS, not NEWLINE)
206 # which can affect the indentation level
209 if token
.must_indent
:
210 # The current depth must be larger than the previous level
211 if not (depth
> levels
[-1]):
212 raise IndentationError("expected an indented block")
215 yield INDENT(token
.lineno
)
217 elif token
.at_line_start
:
218 # Must be on the same level or one of the previous levels
219 if depth
== levels
[-1]:
222 elif depth
> levels
[-1]:
223 raise IndentationError("indent increase but not in new block")
225 # Back up; but only if it matches a previous level
227 i
= levels
.index(depth
)
229 raise IndentationError("inconsistent indentation")
230 for _
in range(i
+1, len(levels
)):
231 yield DEDENT(token
.lineno
)
236 ### Finished processing ###
238 # Must dedent any remaining levels
240 assert token
is not None
241 for _
in range(1, len(levels
)):
242 yield DEDENT(token
.lineno
)
245 # The top-level filter adds an ENDMARKER, if requested.
246 # Python's grammar uses it.
247 def filter(lexer
, add_endmarker
= True):
249 tokens
= iter(lexer
.token
, None)
250 tokens
= python_colonify(lexer
, tokens
)
251 tokens
= track_tokens_filter(lexer
, tokens
)
252 for token
in indentation_filter(tokens
):
257 if token
is not None:
258 lineno
= token
.lineno
259 yield _new_token("ENDMARKER", lineno
)
276 'NUMBER', # Python decimals
277 'BINARY', # Python binary
278 'STRING', # single quoted strings only; syntax of raw strings
318 def build(self
,**kwargs
):
319 self
.lexer
= lex
.lex(module
=self
, **kwargs
)
322 r
"""0x[0-9a-fA-F_]+"""
323 val
= t
.value
.replace("_", "")
324 t
.value
= SelectableInt(int(val
, 16), (len(val
)-2)*16)
327 def t_BINARY(self
, t
):
329 t
.value
= SelectableInt(int(t
.value
, 2), len(t
.value
)-2)
333 # taken from decmial.py but without the leading sign
334 def t_NUMBER(self
, t
):
335 r
"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
336 t
.value
= int(t
.value
)
339 def t_STRING(self
, t
):
340 r
"'([^\\']+|\\'|\\\\)*'" # I think this is right ...
341 print (repr(t
.value
))
342 t
.value
=t
.value
[1:-1]
347 t_ASSIGNEA
= r
'<-iea'
369 # Ply nicely documented how to do this.
384 "default": "DEFAULT",
388 r
'[a-zA-Z_][a-zA-Z0-9_]*'
389 t
.type = self
.RESERVED
.get(t
.value
, "NAME")
392 # Putting this before t_WS let it consume lines with only comments in
393 # them so the latter code never sees the WS part. Not consuming the
394 # newline. Needed for "if 1: #comment"
395 def t_comment(self
, t
):
396 r
"[ ]*\043[^\n]*" # \043 is '#'
403 if t
.lexer
.at_line_start
and t
.lexer
.paren_count
== 0 and \
404 t
.lexer
.brack_count
== 0:
407 # Don't generate newline tokens when inside of parenthesis, eg
410 def t_newline(self
, t
):
412 t
.lexer
.lineno
+= len(t
.value
)
414 if t
.lexer
.paren_count
== 0 and t
.lexer
.brack_count
== 0:
417 def t_LBRACK(self
, t
):
419 t
.lexer
.brack_count
+= 1
422 def t_RBRACK(self
, t
):
424 # check for underflow? should be the job of the parser
425 t
.lexer
.brack_count
-= 1
430 t
.lexer
.paren_count
+= 1
435 # check for underflow? should be the job of the parser
436 t
.lexer
.paren_count
-= 1
441 def t_error(self
, t
):
442 raise SyntaxError("Unknown symbol %r" % (t
.value
[0],))
443 print ("Skipping", repr(t
.value
[0]))
447 # Combine Ply and my filters into a new lexer
449 class IndentLexer(PowerLexer
):
450 def __init__(self
, debug
=0, optimize
=0, lextab
='lextab', reflags
=0):
452 self
.build(debug
=debug
, optimize
=optimize
,
453 lextab
=lextab
, reflags
=reflags
)
454 self
.token_stream
= None
456 def input(self
, s
, add_endmarker
=True):
457 s
= annoying_case_hack_filter(s
)
460 self
.lexer
.paren_count
= 0
461 self
.lexer
.brack_count
= 0
463 self
.token_stream
= filter(self
.lexer
, add_endmarker
)
467 return next(self
.token_stream
)
468 except StopIteration:
487 if (RS)[63-n] = 0b1 then
494 if __name__
== '__main__':
501 lexer
= IndentLexer(debug
=1)
502 # Give the lexer some input
507 tokens
= iter(lexer
.token
, None)