1 # Based on GardenSnake - a parser generator demonstration program
2 # GardenSnake was released into the Public Domain by Andrew Dalke.
4 # Portions of this work are derived from Python's Grammar definition
5 # and may be covered under the Python copyright and license
7 # Andrew Dalke / Dalke Scientific Software, LLC
8 # 30 August 2006 / Cape Town, South Africa
10 # Modifications for inclusion in PLY distribution
13 from soc
.decoder
.selectable_int
import SelectableInt
15 ## I implemented INDENT / DEDENT generation as a post-processing filter
17 # The original lex token stream contains WS and NEWLINE characters.
18 # WS will only occur before any other tokens on a line.
20 # I have three filters. One tags tokens by adding two attributes.
21 # "must_indent" is True if the token must be indented from the
22 # previous code. The other is "at_line_start" which is True for WS
23 # and the first non-WS/non-NEWLINE on a line. It flags the need to
24 # check whether the new line has changed indentation level.
26 # Python's syntax has three INDENT states
27 # 0) no colon hence no need to indent
28 # 1) "if 1: go()" - simple statements have a COLON but no need for an indent
29 # 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
# turn into python-like colon syntax from pseudo-code syntax
def python_colonify(lexer, tokens):
    """Rewrite pseudo-code keyword tokens into Python-style colon syntax.

    THEN becomes a COLON; ELSE gains a synthesized COLON after it;
    WHILE/FOR get a COLON inserted just before their NEWLINE.  All other
    tokens pass through unchanged.

    NOTE(review): the branch bodies were lost to line-mangling and have
    been reconstructed from the GardenSnake reference design — confirm
    against the project original.
    """
    from copy import copy  # local import: synthesized tokens are shallow copies

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)
        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            # emit ELSE itself, then a synthesized COLON right after it
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['WHILE', 'FOR']:
            # no THEN keyword follows these: remember to insert a COLON
            # before the end of the line
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                # emit the COLON deferred from WHILE/FOR above
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    """Tag each token with indentation bookkeeping before passing it on.

    Adds three attributes to every token:
      - ``at_line_start``: True for WS and the first non-WS token on a line
      - ``must_indent``: True for the first real token after COLON NEWLINE
      - ``indent``: the current NO_INDENT / MAY_INDENT / MUST_INDENT state

    Relies on module-level NO_INDENT / MAY_INDENT / MUST_INDENT constants
    (defined elsewhere in this file).

    NOTE(review): several original lines were lost to mangling; the state
    transitions are reconstructed from the GardenSnake reference — confirm.
    """
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                # COLON followed by NEWLINE: the next real token must indent
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore
        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
def _new_token(type, lineno):
    """Build a synthetic ply LexToken of *type* at line *lineno*.

    value is None and lexpos is -1 because the token never existed in the
    input text.  NOTE(review): body reconstructed from the GardenSnake
    reference (original lines lost) — confirm.
    """
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok


# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)


# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    """Yield *tokens*, inserting synthetic INDENT/DEDENT tokens whenever
    the tracked indentation depth changes.

    Raises IndentationError for a missing, unexpected or inconsistent
    indent.  Any still-open levels are dedented after the stream ends.

    NOTE(review): setup lines and branch interiors were lost to mangling
    and are reconstructed from the GardenSnake reference — confirm.
    """
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:  # debug tracing, normally disabled
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here. Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # blank (or whitespace-only) line: swallow it
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level
        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")
            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    """Chain all token post-processing filters over *lexer* and, when
    *add_endmarker* is true, append a final ENDMARKER token (Python's
    grammar expects one).

    NOTE: the name shadows the ``filter`` builtin; kept unchanged for
    API compatibility with callers.
    """
    token = None
    # iter(callable, sentinel): pull tokens until lexer.token() returns None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
234 'NUMBER', # Python decimals
235 'BINARY', # Python binary
236 'STRING', # single quoted strings only; syntax of raw strings
def build(self, **kwargs):
    """Build the underlying ply lexer from this class's t_* rules.

    Extra keyword arguments (debug, optimize, lextab, reflags, ...) are
    forwarded to ``lex.lex``.  The result is stored on ``self.lexer``.
    """
    self.lexer = lex.lex(module=self, **kwargs)
def t_BINARY(self, t):
    r"""0b[01]+"""
    # NOTE(review): the regex docstring above was lost to mangling and is
    # reconstructed — confirm against the project original.
    # Width is the bit-string length minus the leading "0b" (hence -2).
    t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
    return t
# taken from decimal.py but without the leading sign
def t_NUMBER(self, t):
    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
    # the matched text is converted to an int token value
    # (NOTE(review): int() will raise on a float-form match such as
    # "1.5" — presumably only integer literals occur in practice; confirm)
    t.value = int(t.value)
    return t
def t_STRING(self, t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    print (repr(t.value))
    # strip the surrounding single quotes from the token value
    t.value = t.value[1:-1]
    return t
# Ply nicely documented how to do this.
def t_ID(self, t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    # NOTE(review): the def line was lost to mangling; name/signature
    # reconstructed from the ply convention — confirm.
    # reserved words are re-typed via the class's RESERVED mapping;
    # anything else stays a plain NAME
    t.type = self.RESERVED.get(t.value, "NAME")
    return t
# Putting this before t_WS let it consume lines with only comments in
# them so the latter code never sees the WS part. Not consuming the
# newline. Needed for "if 1: #comment"
def t_comment(self, t):
    r"[ ]*\043[^\n]*"  # \043 is '#'
    # comments produce no token: returning None discards the match
    pass
# Whitespace
def t_WS(self, t):
    r'[ ]+'
    # NOTE(review): the def line and regex were lost to mangling and are
    # reconstructed from the GardenSnake reference — confirm.
    # only emit a WS token at the very start of a line, and only when
    # outside any parentheses/brackets; otherwise discard the match
    if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
            t.lexer.brack_count == 0:
        return t
# Don't generate newline tokens when inside of parenthesis, eg
#   a = (1,
#        2, 3)
def t_newline(self, t):
    r'\n+'
    # keep the reported line number in sync with consumed newlines
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    # swallow the token while inside parens/brackets
    if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
        return t
def t_LBRACK(self, t):
    r'\['
    # track bracket nesting so NEWLINE/WS can be suppressed inside [...]
    t.lexer.brack_count += 1
    return t
def t_RBRACK(self, t):
    r'\]'
    # check for underflow? should be the job of the parser
    t.lexer.brack_count -= 1
    return t
def t_LPAR(self, t):
    r'\('
    # NOTE(review): def line/regex lost to mangling, reconstructed from
    # the GardenSnake reference — confirm.
    # track paren nesting so NEWLINE/WS can be suppressed inside (...)
    t.lexer.paren_count += 1
    return t
def t_RPAR(self, t):
    r'\)'
    # NOTE(review): def line/regex lost to mangling, reconstructed from
    # the GardenSnake reference — confirm.
    # check for underflow? should be the job of the parser
    t.lexer.paren_count -= 1
    return t
def t_error(self, t):
    """Abort lexing on an unrecognised character."""
    raise SyntaxError("Unknown symbol %r" % (t.value[0],))
    # NOTE: the original skip-and-continue recovery below is unreachable
    # after the raise (dead code); kept commented for reference.
    # print ("Skipping", repr(t.value[0]))
    # t.lexer.skip(1)
# Combine Ply and my filters into a new lexer
class IndentLexer(PowerLexer):
    """PowerLexer wrapped with the colon / indentation post-processing
    filters, exposing the standard ply-style input()/token() interface.
    """

    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        # reset bracket/paren nesting before lexing a new string
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        # NOTE(review): the original line feeding *s* to the ply lexer was
        # lost to mangling; reconstructed as the conventional call — confirm.
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        # drain the filtered stream; ply expects None at end-of-input
        try:
            return next(self.token_stream)
        except StopIteration:
            return None
409 if __name__
== '__main__':
415 if (RS)[63-n] = 0b1 then
424 lexer
= IndentLexer(debug
=1)
425 # Give the lexer some input