1 # Based on GardenSnake - a parser generator demonstration program
2 # GardenSnake was released into the Public Domain by Andrew Dalke.
4 # Portions of this work are derived from Python's Grammar definition
5 # and may be covered under the Python copyright and license
7 # Andrew Dalke / Dalke Scientific Software, LLC
8 # 30 August 2006 / Cape Town, South Africa
10 # Modifications for inclusion in PLY distribution
from copy import copy

from ply import lex

from openpower.decoder.selectable_int import SelectableInt
15 # I implemented INDENT / DEDENT generation as a post-processing filter
17 # The original lex token stream contains WS and NEWLINE characters.
18 # WS will only occur before any other tokens on a line.
20 # I have three filters. One tags tokens by adding two attributes.
21 # "must_indent" is True if the token must be indented from the
22 # previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check to
# see if the new line has changed indentation level.
26 # Python's syntax has three INDENT states
27 # 0) no colon hence no need to indent
28 # 1) "if 1: go()" - simple statements have a COLON but no need for an indent
29 # 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
34 # turn into python-like colon syntax from pseudo-code syntax.
35 # identify tokens which tell us whether a "hidden colon" is needed.
36 # this in turn means that track_tokens_filter "works" without needing
37 # complex grammar rules
def python_colonify(lexer, tokens):
    """Generator filter: insert python-style COLON tokens into the stream.

    The pseudo-code syntax uses "then" (and bare DO/WHILE/FOR/SWITCH
    statements) where python would use ":".  This filter rewrites the
    token stream so the downstream indentation filters see python-like
    colons:

    * THEN is turned directly into a COLON token
    * ELSE is passed through, followed by a synthesized COLON
    * DO/WHILE/FOR/SWITCH set a flag so that the next NEWLINE is
      preceded by a synthesized COLON

    ``lexer`` is unused here but kept for signature-compatibility with
    the other filters.
    """
    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)
        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            # emit the ELSE, then a synthesized COLON right after it
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            # these statements have no explicit colon: remember that one
            # is needed at the end of the line (just before NEWLINE)
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    """Generator filter: tag each token with indentation bookkeeping.

    Adds three attributes to every token:

    * ``at_line_start`` - True for the first token on a line
    * ``must_indent``   - True when the token must start an indented
                          block (i.e. it follows COLON NEWLINE)
    * ``indent``        - the current NO/MAY/MUST_INDENT state

    Also mirrors ``at_line_start`` onto the lexer itself and (as a
    deliberate hack, see below) flips the lexer's ignore-set so that
    whitespace is skipped once real tokens have been seen on a line.
    """
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start
        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT      # a colon *may* open a block
            token.must_indent = False
        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT  # COLON then NEWLINE: block required
            token.must_indent = False
        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False
        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
def _new_token(type, lineno):
    """Create a fresh ply LexToken of the given type at the given line.

    NOTE(review): body reconstructed — only the signature was legible in
    the damaged source; confirm against the original file.
    """
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok


# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)


# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)
def count_spaces(l):
    """Return the index of the first non-space character in *l*.

    Quirk (preserved deliberately): an empty or all-space string
    returns 0, not its length.
    """
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0
def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this which tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    def _spc_count(line):
        # index of first non-space char; 0 for empty/all-space lines
        # (identical quirky semantics to the module-level count_spaces)
        for i, ch in enumerate(line):
            if ch != ' ':
                return i
        return 0

    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = _spc_count(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            # an empty case/default body is marked by the *previous*
            # line ending in a bare ":" at the same indent level: tag
            # that previous line with the silent "fallthrough" keyword
            if (prev_spc_count is not None and
                prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    """Generator filter: convert WS/NEWLINE bookkeeping into INDENT and
    DEDENT tokens, python-tokenizer style.

    Relies on the ``at_line_start`` / ``must_indent`` / ``indent``
    attributes added by track_tokens_filter.  Raises IndentationError
    on inconsistent or unexpected indentation.
    """
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:  # debug tracing, disabled
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")
            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    """Chain all token filters over *lexer* and yield the final stream.

    Pipeline: raw lexer tokens -> python_colonify ->
    track_tokens_filter -> indentation_filter, then (optionally) a
    trailing ENDMARKER token at the last seen line number.

    NOTE: shadows the builtin ``filter`` — name kept for compatibility
    with existing callers.
    """
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
290 'NUMBER', # Python decimals
291 'BINARY', # Python binary
292 'STRING', # single quoted strings only; syntax of raw strings
def build(self, **kwargs):
    # Build the PLY lexer from this class's t_* rule methods/strings.
    # (method of PowerLexer; **kwargs are forwarded to ply's lex.lex)
    self.lexer = lex.lex(module=self, **kwargs)
def t_HEX(self, t):
    r"""0x[0-9a-fA-F_]+"""
    # NOTE: in ply the docstring above IS the token regex - do not
    # replace it with documentation.
    # underscores are legal digit separators: strip them before parsing
    val = t.value.replace("_", "")
    # hex = nibble: 4 bits per hex digit after the "0x" prefix
    t.value = SelectableInt(int(val, 16), (len(val)-2)*4)
    return t
def t_BINARY(self, t):
    r"""0b[01]+"""
    # docstring above is the ply token regex.
    # bit-width is the literal length minus the "0b" prefix
    t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
    return t
# taken from decmial.py but without the leading sign
def t_NUMBER(self, t):
    r"""[-]?(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
    # docstring above is the ply token regex.
    # NOTE(review): the regex admits float-looking text ("1.5") but
    # int() would raise on it - presumably the grammar only ever feeds
    # integer literals here; confirm before widening.
    t.value = int(t.value)
    return t
def t_STRING(self, t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    # regex above (single-quoted strings only); strip the quotes
    t.value = t.value[1:-1]
    return t
# "<-iea" effective-address assignment operator (ply literal token rule)
t_ASSIGNEA = r'<-iea'
383 # Ply nicely documented how to do this.
398 "default": "DEFAULT",
def t_NAME(self, t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    # regex above is the ply token rule.  Identifiers that appear in the
    # RESERVED map are re-typed to their keyword token; everything else
    # stays a NAME.  (Ply nicely documented how to do this.)
    t.type = self.RESERVED.get(t.value, "NAME")
    return t
# Putting this before t_WS let it consume lines with only comments in
# them so the latter code never sees the WS part.  Not consuming the
# newline.  Needed for "if 1: #comment"
def t_comment(self, t):
    r"[ ]*\043[^\n]*"  # \043 is '#'
    # no return: comment tokens are silently discarded
    pass
# Whitespace: only significant at the start of a line, and only when
# not inside parentheses/brackets.
def t_WS(self, t):
    r'[ ]+'
    if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
            t.lexer.brack_count == 0:
        return t
    # otherwise the whitespace token is discarded (implicit None)
# Don't generate newline tokens when inside of parenthesis, eg
#   a = (1,
#        2, 3)
def t_newline(self, t):
    r'\n+'
    # track line numbers across multi-newline matches
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
        return t
    # inside () or []: swallow the newline (implicit None)
def t_LBRACK(self, t):
    r'\['
    # track bracket nesting so NEWLINE/WS can be suppressed inside []
    t.lexer.brack_count += 1
    return t
def t_RBRACK(self, t):
    r'\]'
    # check for underflow?  should be the job of the parser
    t.lexer.brack_count -= 1
    return t
def t_LPAR(self, t):
    r'\('
    # track paren nesting so NEWLINE/WS can be suppressed inside ()
    t.lexer.paren_count += 1
    return t
def t_RPAR(self, t):
    r'\)'
    # check for underflow?  should be the job of the parser
    t.lexer.paren_count -= 1
    return t
def t_error(self, t):
    # any unrecognised character is a hard failure
    raise SyntaxError("Unknown symbol %r" % (t.value[0],))
    # unreachable fallback kept from the original (alternate
    # skip-and-continue behaviour, enabled by removing the raise)
    print("Skipping", repr(t.value[0]))
# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    """PowerLexer wrapped with the colon/indent post-processing filters,
    presenting the standard ply lexer interface (input() / token())."""

    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        # apply the "case/default fallthrough" source transform first
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        # reset bracket/paren nesting state before lexing new input
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        # ply-style interface: return the next token, or None at the end
        try:
            return next(self.token_stream)
        except StopIteration:
            return None
505 if (RS)[63-n] = 0b1 then
if __name__ == '__main__':
    # quick test/demo: lex the sample pseudocode and print every token
    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    # NOTE(review): 'code' is the sample pseudocode string defined
    # earlier in the file (not visible here) - confirm the name.
    lexer.input(code)
    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)