# soc.git: src/soc/decoder/pseudo/lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check
# to see whether the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert the pseudo-code syntax into python-like colon syntax:
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
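#
# illustrative example (token types only): the pseudo-code line
#     if a = 1 then b <- 2
# lexes roughly as
#     IF NAME EQ NUMBER THEN NAME ASSIGN NUMBER NEWLINE
# and this filter rewrites THEN into COLON.  similarly "do while n < 64"
# gains a COLON just before its NEWLINE, because DO/WHILE/FOR/SWITCH set
# implied_colon_needed.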
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's "ignore" state:
        # when "must indent" is seen (basically, a "real" token has
        # been seen), start ignoring whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

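# illustrative walk-through of track_tokens_filter: for the (already
# colonified) stream
#     IF ... COLON NEWLINE WS NAME ...
# the COLON sets indent=MAY_INDENT, the NEWLINE upgrades it to MUST_INDENT,
# and the first real token on the next line (the NAME) is tagged
# must_indent=True, which indentation_filter below turns into an INDENT.
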
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)

def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0

def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick gets the parser to take the (silent) case statement
    as a "small expression".  it can then be spotted and used to
    indicate "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that requires a proper tokeniser
    which spots that "case" and "default" are whole words, separated
    from spaces, colons, brackets etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                prev_spc_count == spc_count and
                (res[-1].endswith(":") or res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)

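# illustrative example: annoying_case_hack_filter turns an empty case
# into an explicit fallthrough, e.g.
#     case(2):
#     case(4):
#         x <- 3
# becomes
#     case(2): fallthrough
#     case(4):
#         x <- 3
# a completely empty line between the two cases is skipped and does not
# break the detection.
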
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

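# illustrative example: after colonification and tagging, the pseudo-code
#     do while n < 64
#         n <- n + 1
#     RA <- EXTZ64(n)
# comes out (roughly) as
#     DO WHILE NAME LT NUMBER COLON NEWLINE
#     INDENT NAME ASSIGN NAME PLUS NUMBER NEWLINE
#     DEDENT NAME ASSIGN NAME LPAR NAME RPAR NEWLINE
# i.e. changes in indentation depth become explicit INDENT / DEDENT
# tokens, just as in Python's own grammar.
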
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

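    # illustrative literal widths produced above:
    #     0x01FF -> a 16-bit SelectableInt (4 hex nibbles * 4 bits)
    #     0b0101 -> a 4-bit SelectableInt  (4 binary digits)
    # underscores in a hex literal are stripped before the width is counted.
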
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

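    # illustrative: t_NAME matches any identifier and then consults RESERVED,
    # so "leave" comes out as a BREAK token while "leaves" stays a plain NAME.
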
    # Putting this before t_WS lets it consume lines with only comments in
    # them so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside parentheses, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable after the raise above (leftover from the PLY example)
        #print ("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print (s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print (code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print (token)

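    # extra demo (not in the original test): re-run the lexer and print just
    # the token types, which makes the injected COLON / INDENT / DEDENT
    # tokens easy to spot
    lexer.input(code)
    print (" ".join(tok.type for tok in iter(lexer.token, None)))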