# [soc.git] src/soc/decoder/pseudo/lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE tokens.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line. It flags the
# check to see whether the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# turn pseudo-code syntax into python-like colon syntax.
# identify the tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules

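# Illustrative example (not part of the original comments): given the
# pseudo-code line
#
#     do while n < 64
#
# there is no explicit COLON token, so python_colonify() below inserts one
# just before the NEWLINE, as if the input had been "do while n < 64:".
# A THEN token is simply renamed to COLON, and ELSE has a COLON token
# appended immediately after it.

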
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

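# Illustrative example (not part of the original comments): after
# colonification, "if x = 1 then" followed by an indented "x <- 2" arrives
# here as IF NAME EQ NUMBER COLON NEWLINE WS NAME ASSIGN NUMBER ...; the
# COLON sets MAY_INDENT, the following NEWLINE promotes it to MUST_INDENT,
# and so the first real token of the indented line ("x") is tagged
# must_indent=True for indentation_filter() below to act on.
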

def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                prev_spc_count == spc_count and
                    (res[-1].endswith(":") or res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)

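# Illustrative example (not part of the original docstring): the filter
# above rewrites
#
#     case(2):
#     case(4):
#         x <- 3
#
# as
#
#     case(2): fallthrough
#     case(4):
#         x <- 3
#
# because both case lines sit at the same indentation and the first ends
# with ":".  The parser can later spot the synthetic "fallthrough" word and
# treat the otherwise-empty case as falling through to the next one.
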

# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

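# Illustrative example (not from the original sources): for the pseudo-code
#
#     if a = 1 then
#         b <- 2
#     c <- 3
#
# this filter yields roughly
#
#     IF NAME EQ NUMBER COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE
#     DEDENT NAME ASSIGN NUMBER NEWLINE
#
# i.e. the same INDENT/DEDENT bracketing that Python's own tokenizer
# produces for real Python source.
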

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print(repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments in
    # them so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)