# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt

class SyntaxError2(Exception):
    """ Raised in place of SyntaxError so that ply does not eat the error:
    ply catches SyntaxError itself, sets a flag, and discards it.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]), cls=cls)

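
# A minimal usage sketch (hypothetical inputs; nothing in this module calls
# it): the four-tuple handed to SyntaxError2 has the same
# (filename, lineno, col, text) shape that CPython's SyntaxError expects,
# so the re-raised error reports file, line and column.
def _demo_raise_syntax_error():
    text = "x <- y\nz <- 1 ?\n"
    try:
        raise_syntax_error("Unknown symbol '?'", "demo.mdwn", 2,
                           text.index("?"), text)
    except SyntaxError2 as e:
        print(e)  # Unknown symbol '?' (demo.mdwn, line 2)
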
# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags the check to
# see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
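
# For example (illustrative, using the pseudo-code's own "then" syntax,
# which python_colonify below rewrites into COLON tokens):
#
#   x <- 1            NO_INDENT:   no COLON on the line
#   if x = 1 then y   MAY_INDENT:  COLON seen, body on the same line
#   if x = 1 then     MUST_INDENT: COLON then NEWLINE, body must indent
#       y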

# convert pseudo-code "then"-style syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token

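# A small self-check (illustrative; not called by the module).  The filter
# ignores its lexer argument, so None suffices, and _new_token (defined
# below, resolved at call time) is enough to fabricate a stream: THEN
# becomes COLON, and WHILE earns an implied COLON before its NEWLINE.
def _demo_python_colonify():
    types = ['WHILE', 'NAME', 'NEWLINE', 'IF', 'THEN', 'NAME']
    toks = (_new_token(t, lineno=1) for t in types)
    return [t.type for t in python_colonify(None, toks)]
    # ['WHILE', 'NAME', 'COLON', 'NEWLINE', 'IF', 'COLON', 'NAME']
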

# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's ignore state: once
        # "must indent" is seen (i.e. a real token), ignore whitespace.
        # note: ply only ever tests "char in lexignore", so this tuple
        # happens to ignore exactly the space character.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok


# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # number of leading spaces; an all-space line counts in full
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return len(l)


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
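
# Illustrative before/after for the case hack (not called by the module):
# two consecutive "case(...):" lines at the same indent mean the first arm
# is empty, so it gains an explicit "fallthrough" marker for the parser.
def _demo_case_hack():
    before = ("switch (n)\n"
              "    case(2):\n"
              "    case(4):\n"
              "        x <- 3\n")
    return annoying_case_hack_filter(before)
    # "    case(2):" has become "    case(2): fallthrough"
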


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token)
            if token.at_line_start:
                print("at_line_start")
            if token.must_indent:
                print("must_indent")
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos,
                                       token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

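# A runnable end-to-end sketch (IndentLexer is defined further down; the
# name resolves at call time).  It shows the net effect of all the filters
# on a two-line pseudo-code input.
def _demo_indentation_filter():
    lexer = IndentLexer(debug=0)
    lexer.input("if x = 1 then\n    x <- 2\n")
    return [tok.type for tok in iter(lexer.token, None)]
    # ['IF', 'NAME', 'EQ', 'NUMBER', 'COLON', 'NEWLINE',
    #  'INDENT', 'NAME', 'ASSIGN', 'NUMBER', 'NEWLINE',
    #  'DEDENT', 'ENDMARKER']
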

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)


KEYWORD_REPLACEMENTS = {'class': 'class_'}

##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'LSHIFT',
        'RSHIFT',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'QMARK',
        'PERIOD',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 2), len(val)-2)
        return t

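    # Width bookkeeping sketch (illustrative): each hex nibble is 4 bits
    # and each binary digit is 1 bit, underscores excluded, so:
    #   0x01F  -> SelectableInt(0x1f, 12)   # 3 nibbles = 12 bits
    #   0b0101 -> SelectableInt(0b0101, 4)  # 4 digits  = 4 bits
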
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print(repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LSHIFT = r'<<'
    t_RSHIFT = r'>>'
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_PERIOD = r'\.'  # escaped: a bare '.' would match any character
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'
    t_QMARK = r'\?'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        if t.value in KEYWORD_REPLACEMENTS:
            t.value = KEYWORD_REPLACEMENTS[t.value]
        return t

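    # Identifier handling sketch (illustrative): "case" lexes as a CASE
    # token via RESERVED, while "class" stays a NAME but has its value
    # rewritten to "class_" via KEYWORD_REPLACEMENTS, presumably so that
    # downstream generated Python never collides with the real keyword.
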
    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # unreachable: raise_syntax_error always raises
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)