# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license

#          Andrew Dalke / Dalke Scientific Software, LLC
#             30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy

from ply import lex

from openpower.decoder.selectable_int import SelectableInt


def raise_syntax_error(msg, filename, lineno, lexpos, input_text):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError(str(msg), (filename, lineno, col,
                                 input_text[line_start:line_end]))
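
# Usage sketch (added for illustration; mirrors how t_error() at the
# bottom of this file calls it).  The tuple handed to SyntaxError is the
# same (filename, lineno, col, text) shape CPython uses for caret-style
# error reports:
#
#   try:
#       raise_syntax_error("Unknown symbol '?'", "demo.txt", 1, 7,
#                          "a <- b ? c\n")
#   except SyntaxError as e:
#       assert (e.lineno, e.offset) == (1, 8)
#       assert e.text == "a <- b ? c"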

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start", which is True for WS
# and the first non-WS/non-NEWLINE token on a line.  It flags the check
# to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
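
# Illustrative mapping (added; assumes python_colonify() below has
# already rewritten THEN into COLON):
#
#   x <- 1                   NO_INDENT    no colon on the line
#   if a = 1 then b <- 2     MAY_INDENT   COLON with code on the same line
#   if a = 1 then            MUST_INDENT  COLON then NEWLINE: the next
#       b <- 2                            real token must be indented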

# turn into python-like colon syntax from pseudo-code syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token
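
# Example of the rewrite (added): "if a = 1 then" lexes roughly as
#   IF NAME EQ NUMBER THEN NEWLINE
# and comes out of python_colonify() as
#   IF NAME EQ NUMBER COLON NEWLINE
# which is the shape track_tokens_filter() expects.  DO/WHILE/FOR/SWITCH
# have no textual colon at all, so one is synthesized just before the
# NEWLINE that ends their header line.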


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
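
# State walk-through (added): for the stream COLON NEWLINE NAME, the
# filter steps NO_INDENT -> MAY_INDENT (on COLON) -> MUST_INDENT (on
# NEWLINE), so the NAME arrives tagged must_indent=True and
# indentation_filter() below knows to emit INDENT in front of it.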


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1  # synthetic token: no real position in the input
    return tok


# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)


# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick makes the parser take the (silent) case statement as a
    "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
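
# Worked example (added).  Input:          Output:
#   switch (n)                             switch (n)
#       case(2):                               case(2): fallthrough
#       case(4):                               case(4):
#           x <- 3                                 x <- 3
# " fallthrough" is appended to a "case...:" line when the *next* line
# is another case/default at the same indentation, which is how the
# parser tells deliberate fall-through apart from an indented body.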


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:  # debug trace, disabled
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")
            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                pass  # at the same level
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
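
# Net effect (added illustration): the pseudo-code
#   do while n < 64
#       n <- n + 1
#   RA <- n
# comes out roughly as
#   DO WHILE ... COLON NEWLINE INDENT NAME ... NEWLINE DEDENT NAME ...
# matching what CPython's tokenizer would emit for the ':'-style form.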


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
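
# Pipeline summary (added): IndentLexer.input() below wires this up as
#   raw PLY tokens -> python_colonify -> track_tokens_filter
#                  -> indentation_filter -> [ENDMARKER]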


class PowerLexer:
    # token names (partial list; the original also names the keywords,
    # operators, WS/NEWLINE and the INDENT/DEDENT/ENDMARKER used above)
    tokens = (
        # ...
        'NUMBER',  # Python decimals
        'BINARY',  # Python binary
        'STRING',  # single quoted strings only; syntax of raw strings
        # ...
    )

    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t
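
    # Width example (added): "0x01ff" has four hex digits after "0x",
    # one nibble (4 bits) each, so it becomes SelectableInt(0x1ff, 16);
    # underscores are stripped first, so "0x0_1ff" gives the identical
    # 16-bit value.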

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        t.value = t.value[1:-1]
        return t

    t_ASSIGNEA = r'<-iea'

    # Ply nicely documented how to do this.
    RESERVED = {
        # ... (other reserved words elided) ...
        "default": "DEFAULT",
        # ...
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments
    # in them, so the latter code never sees the WS part.  Not consuming
    # the newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'

    # Whitespace: significant only at the start of a line, outside brackets
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #   a = (1,
    #        2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow? should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):  # rule name assumed, following LBRACK/RBRACK above
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):  # rule name assumed, as above
        r'\)'
        # check for underflow? should be the job of the parser
        t.lexer.paren_count -= 1
        return t
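
    # Why count (added): t_WS and t_newline above consult these
    # counters, so inside (...) or [...] a line break is not a statement
    # boundary - e.g. "a <- (1,\n      2)" yields no NEWLINE token
    # between the elements, mirroring Python's implicit line joining.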

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # unreachable after the raise; kept as the original recovery path
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


# demo input: PowerISA count-trailing-zeros pseudo-code.  Only the "if"
# line survives in this copy; the rest of the loop is a reconstruction.
code = """n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
"""

if __name__ == '__main__':
    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)