1 # Based on GardenSnake - a parser generator demonstration program
2 # GardenSnake was released into the Public Domain by Andrew Dalke.
4 # Portions of this work are derived from Python's Grammar definition
5 # and may be covered under the Python copyright and license
7 # Andrew Dalke / Dalke Scientific Software, LLC
8 # 30 August 2006 / Cape Town, South Africa
10 # Modifications for inclusion in PLY distribution
13 from soc
.decoder
.selectable_int
import SelectableInt
15 ## I implemented INDENT / DEDENT generation as a post-processing filter
17 # The original lex token stream contains WS and NEWLINE characters.
18 # WS will only occur before any other tokens on a line.
20 # I have three filters. One tags tokens by adding two attributes.
21 # "must_indent" is True if the token must be indented from the
22 # previous code. The other is "at_line_start" which is True for WS
23 # and the first non-WS/non-NEWLINE on a line.  It flags the check to
24 # see if the new line has changed indentation level.
26 # Python's syntax has three INDENT states
27 # 0) no colon hence no need to indent
28 # 1) "if 1: go()" - simple statements have a COLON but no need for an indent
29 # 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
34 # turn into python-like colon syntax from pseudo-code syntax.
35 # identify tokens which tell us whether a "hidden colon" is needed.
36 # this in turn means that track_tokens_filter "works" without needing
37 # complex grammar rules
38 def python_colonify(lexer
, tokens
):
40 implied_colon_needed
= False
42 #print ("track colon token", token, token.type)
44 if token
.type == 'THEN':
45 # turn then into colon
48 elif token
.type == 'ELSE':
53 elif token
.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
54 implied_colon_needed
= True
56 elif token
.type == 'NEWLINE':
57 if implied_colon_needed
:
61 implied_colon_needed
= False
67 # only care about whitespace at the start of a line
68 def track_tokens_filter(lexer
, tokens
):
69 oldignore
= lexer
.lexignore
70 lexer
.at_line_start
= at_line_start
= True
74 #print ("track token", token, token.type)
75 token
.at_line_start
= at_line_start
77 if token
.type == "COLON":
80 token
.must_indent
= False
82 elif token
.type == "NEWLINE":
84 if indent
== MAY_INDENT
:
86 token
.must_indent
= False
88 elif token
.type == "WS":
89 assert token
.at_line_start
== True
91 token
.must_indent
= False
94 # A real token; only indent after COLON NEWLINE
95 if indent
== MUST_INDENT
:
96 token
.must_indent
= True
98 token
.must_indent
= False
102 # really bad hack that changes ignore lexer state.
103 # when "must indent" is seen (basically "real tokens" seen)
104 # then ignore whitespace.
105 if token
.must_indent
:
106 lexer
.lexignore
= ('ignore', ' ')
108 lexer
.lexignore
= oldignore
110 token
.indent
= indent
112 lexer
.at_line_start
= at_line_start
114 def _new_token(type, lineno
):
122 # Synthesize a DEDENT tag
124 return _new_token("DEDENT", lineno
)
126 # Synthesize an INDENT tag
128 return _new_token("INDENT", lineno
)
131 for i
in range(len(l
)):
136 def annoying_case_hack_filter(code
):
137 """add annoying "silent keyword" (fallthrough)
139 this which tricks the parser into taking the (silent) case statement
140 as a "small expression". it can then be spotted and used to indicate
141 "fall through" to the next case (in the parser)
143 also skips blank lines
145 bugs: any function that starts with the letters "case" or "default"
146 will be detected erroneously. fixing that involves doing a token
147 lexer which spots the fact that "case" and "default" are words,
148 separating them from space, colon, bracket etc.
150 http://bugs.libre-riscv.org/show_bug.cgi?id=280
153 prev_spc_count
= None
154 for l
in code
.split("\n"):
155 spc_count
= count_spaces(l
)
156 nwhite
= l
[spc_count
:]
157 if len(nwhite
) == 0: # skip blank lines
159 if nwhite
.startswith("case") or nwhite
.startswith("default"):
160 #print ("case/default", nwhite, spc_count, prev_spc_count)
161 if (prev_spc_count
is not None and
162 prev_spc_count
== spc_count
and
163 (res
[-1].endswith(":") or res
[-1].endswith(": fallthrough"))):
164 res
[-1] += " fallthrough" # add to previous line
165 prev_spc_count
= spc_count
167 #print ("notstarts", spc_count, nwhite)
168 prev_spc_count
= None
170 return '\n'.join(res
)
173 # Track the indentation level and emit the right INDENT / DEDENT events.
174 def indentation_filter(tokens
):
175 # A stack of indentation levels; will never pop item 0
182 print ("Process", depth
, token
.indent
, token
,)
183 if token
.at_line_start
:
184 print ("at_line_start",)
185 if token
.must_indent
:
186 print ("must_indent",)
189 # WS only occurs at the start of the line
190 # There may be WS followed by NEWLINE so
191 # only track the depth here. Don't indent/dedent
192 # until there's something real.
193 if token
.type == "WS":
195 depth
= len(token
.value
)
197 # WS tokens are never passed to the parser
200 if token
.type == "NEWLINE":
202 if prev_was_ws
or token
.at_line_start
:
205 # pass the other cases on through
209 # then it must be a real token (not WS, not NEWLINE)
210 # which can affect the indentation level
213 if token
.must_indent
:
214 # The current depth must be larger than the previous level
215 if not (depth
> levels
[-1]):
216 raise IndentationError("expected an indented block")
219 yield INDENT(token
.lineno
)
221 elif token
.at_line_start
:
222 # Must be on the same level or one of the previous levels
223 if depth
== levels
[-1]:
226 elif depth
> levels
[-1]:
227 raise IndentationError("indent increase but not in new block")
229 # Back up; but only if it matches a previous level
231 i
= levels
.index(depth
)
233 raise IndentationError("inconsistent indentation")
234 for _
in range(i
+1, len(levels
)):
235 yield DEDENT(token
.lineno
)
240 ### Finished processing ###
242 # Must dedent any remaining levels
244 assert token
is not None
245 for _
in range(1, len(levels
)):
246 yield DEDENT(token
.lineno
)
249 # The top-level filter adds an ENDMARKER, if requested.
250 # Python's grammar uses it.
251 def filter(lexer
, add_endmarker
= True):
253 tokens
= iter(lexer
.token
, None)
254 tokens
= python_colonify(lexer
, tokens
)
255 tokens
= track_tokens_filter(lexer
, tokens
)
256 for token
in indentation_filter(tokens
):
261 if token
is not None:
262 lineno
= token
.lineno
263 yield _new_token("ENDMARKER", lineno
)
280 'NUMBER', # Python decimals
281 'BINARY', # Python binary
282 'STRING', # single quoted strings only; syntax of raw strings
322 def build(self
,**kwargs
):
323 self
.lexer
= lex
.lex(module
=self
, **kwargs
)
326 r
"""0x[0-9a-fA-F_]+"""
327 val
= t
.value
.replace("_", "")
328 t
.value
= SelectableInt(int(val
, 16), (len(val
)-2)*16)
331 def t_BINARY(self
, t
):
333 t
.value
= SelectableInt(int(t
.value
, 2), len(t
.value
)-2)
337 # taken from decmial.py but without the leading sign
338 def t_NUMBER(self
, t
):
339 r
"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
340 t
.value
= int(t
.value
)
343 def t_STRING(self
, t
):
344 r
"'([^\\']+|\\'|\\\\)*'" # I think this is right ...
345 print (repr(t
.value
))
346 t
.value
=t
.value
[1:-1]
351 t_ASSIGNEA
= r
'<-iea'
373 # Ply nicely documented how to do this.
388 "default": "DEFAULT",
392 r
'[a-zA-Z_][a-zA-Z0-9_]*'
393 t
.type = self
.RESERVED
.get(t
.value
, "NAME")
396 # Putting this before t_WS let it consume lines with only comments in
397 # them so the latter code never sees the WS part. Not consuming the
398 # newline. Needed for "if 1: #comment"
399 def t_comment(self
, t
):
400 r
"[ ]*\043[^\n]*" # \043 is '#'
407 if t
.lexer
.at_line_start
and t
.lexer
.paren_count
== 0 and \
408 t
.lexer
.brack_count
== 0:
411 # Don't generate newline tokens when inside of parenthesis, eg
414 def t_newline(self
, t
):
416 t
.lexer
.lineno
+= len(t
.value
)
418 if t
.lexer
.paren_count
== 0 and t
.lexer
.brack_count
== 0:
421 def t_LBRACK(self
, t
):
423 t
.lexer
.brack_count
+= 1
426 def t_RBRACK(self
, t
):
428 # check for underflow? should be the job of the parser
429 t
.lexer
.brack_count
-= 1
434 t
.lexer
.paren_count
+= 1
439 # check for underflow? should be the job of the parser
440 t
.lexer
.paren_count
-= 1
445 def t_error(self
, t
):
446 raise SyntaxError("Unknown symbol %r" % (t
.value
[0],))
447 print ("Skipping", repr(t
.value
[0]))
451 # Combine Ply and my filters into a new lexer
453 class IndentLexer(PowerLexer
):
454 def __init__(self
, debug
=0, optimize
=0, lextab
='lextab', reflags
=0):
456 self
.build(debug
=debug
, optimize
=optimize
,
457 lextab
=lextab
, reflags
=reflags
)
458 self
.token_stream
= None
460 def input(self
, s
, add_endmarker
=True):
461 s
= annoying_case_hack_filter(s
)
465 self
.lexer
.paren_count
= 0
466 self
.lexer
.brack_count
= 0
468 self
.token_stream
= filter(self
.lexer
, add_endmarker
)
472 return next(self
.token_stream
)
473 except StopIteration:
494 if (RS)[63-n] = 0b1 then
501 if __name__
== '__main__':
508 lexer
= IndentLexer(debug
=1)
509 # Give the lexer some input
514 tokens
= iter(lexer
.token
, None)