# soc.git - src/soc/decoder/pseudo/lexer.py
# commit: add <-iea operator
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line. It flags the
# check to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
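
# Illustrative mapping of pseudo-code on to these states (a sketch,
# assuming the THEN -> COLON rewrite performed by python_colonify below):
#
#   n <- 1             NO_INDENT   (no colon on the line)
#   if a then b <- 1   MAY_INDENT  (colon seen, statement follows inline)
#   if a then          MUST_INDENT (colon followed directly by newline;
#       b <- 1                      the next line must indent)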

# turn into python-like colon syntax from pseudo-code syntax
def python_colonify(lexer, tokens):

    forwhile_seen = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR']:
            forwhile_seen = True
            yield token
        elif token.type == 'NEWLINE':
            if forwhile_seen:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                forwhile_seen = False
            yield token
        else:
            yield token

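# Illustrative effect of the rewrite above, as token types (a sketch):
#
#   "x then y"         ... THEN ...        ->  ... COLON ...
#   "else"             ELSE NEWLINE        ->  ELSE COLON NEWLINE
#   "while x" + "\n"   WHILE NAME NEWLINE  ->  WHILE NAME COLON NEWLINE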

# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's ignore state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

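# Example of the tagging above (a sketch): in "if a then\n    b <- 1",
# the COLON (rewritten from "then") sets MAY_INDENT, the NEWLINE promotes
# it to MUST_INDENT, so the NAME token "b" arrives at the next filter
# with at_line_start=True and must_indent=True.
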
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)

# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in a new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

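# Worked example of the filter above: given the (already colonified)
# pseudo-code
#
#   while a :
#       b <- 1
#   c <- 2
#
# the NAME "b" carries must_indent, so depth 4 is pushed onto levels and
# an INDENT is emitted before it; "c" is back at depth 0, which matches
# levels[0], so one matching DEDENT is emitted before it.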

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'NUMBER',  # Python decimals
        'BINARY',  # Python binary
        'STRING',  # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t
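
    # e.g. t_BINARY turns "0b101" into SelectableInt(5, 3): the bit-width
    # is the number of binary digits after the "0b" prefix.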

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        # note: the pattern matches decimal floats as well, but the
        # conversion below only handles integers
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

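    # Illustrative token stream produced by these rules (a sketch):
    #
    #   RA <-iea CIA + 4
    #
    # lexes as NAME ASSIGNEA NAME PLUS NUMBER NEWLINE; the longer
    # pattern "<-iea" wins over "<-" because PLY tries string-defined
    # rules in order of decreasing regex length.
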
    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments
    # in them, so the later code never sees the WS part.  It does not
    # consume the newline, which is needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow? should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow? should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        # no recovery attempted: report the unknown symbol and stop
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))

# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

if __name__ == '__main__':

    # quick test/demo
    cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

    code = cnttzd

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)

431