# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags where to check
# whether the new line has changed the indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token

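# A sketch of the transformation (illustrative, not executed): for the
# pseudo-code line "if a = 1 then", the raw token stream
#
#     IF NAME EQ NUMBER THEN NEWLINE
#
# comes out of python_colonify as
#
#     IF NAME EQ NUMBER COLON NEWLINE
#
# and "do while n < 64" (DO/WHILE set implied_colon_needed) gains a
# synthesised COLON just before its NEWLINE:
#
#     DO WHILE NAME LT NUMBER COLON NEWLINE
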
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

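# Sketch of the tagging (illustrative, not executed): given
# "if 1 then\n    go()", the THEN has already become COLON, which sets
# indent to MAY_INDENT; the NEWLINE upgrades it to MUST_INDENT, so the
# first real token of the next line ("go") is tagged must_indent=True,
# which makes indentation_filter (below) emit an INDENT in front of it.
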
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)

def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0

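# For example (a sketch): count_spaces("    x <- 3") returns 4, and an
# empty or all-space line returns 0.
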
def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick makes the parser take the (silent) case statement
    as a "small expression". it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously. fixing that involves writing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)

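# Worked example (a sketch, not executed): given the fragment
#
#     case(2):
#     case(4):
#         x <- 3
#
# the filter folds the empty case onto the previous line, so the parser
# sees a (silent) body for case(2):
#
#     case(2): fallthrough
#     case(4):
#         x <- 3
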
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here. Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

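# Sketch of the resulting stream (illustrative, not executed): after
# python_colonify and track_tokens_filter, the pseudo-code
#
#     if a = 1 then
#         b <- 2
#     c <- 3
#
# filters down to roughly
#
#     IF NAME EQ NUMBER COLON NEWLINE
#     INDENT NAME ASSIGN NUMBER NEWLINE
#     DEDENT NAME ASSIGN NUMBER NEWLINE
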

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

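# The full pipeline is therefore: raw PLY tokens -> python_colonify ->
# track_tokens_filter -> indentation_filter, plus an optional trailing
# ENDMARKER.
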
##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        # each hex digit is 4 bits wide
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

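    # Width examples (a sketch, not executed): "0xFF" has two hex
    # digits, so it lexes to SelectableInt(255, 8); "0b101" has three
    # binary digits, so it lexes to SelectableInt(5, 3).
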
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments
    # in them, so the latter code never sees the WS part. It does not
    # consume the newline, which is needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass


    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow? should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow? should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable - the raise above makes lexing errors fatal:
        #print ("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print (s)
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

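# Typical usage (a sketch; the __main__ demo below does the same thing):
#
#     lexer = IndentLexer(debug=0)
#     lexer.input("x <- 5\n")
#     while True:
#         tok = lexer.token()
#         if tok is None:
#             break
#         print (tok)
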
switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):
    case(4):
        x <- 3
    case(9):
    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print (code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print (token)