Revert "add support for pseudocode being a [[!inline]] directive"
[openpower-isa.git] / src / openpower / decoder / pseudo / lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt


class SyntaxError2(Exception):
    """ class used to raise a syntax error yet still get ply to stop eating
    errors, since ply catches and discards SyntaxError after setting a flag.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]), cls=cls)
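
# note: the args tuple handed to SyntaxError matches CPython's expected
# detail format of (msg, (filename, lineno, col, source_line)), so the
# raised error prints with the offending source line included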

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check
# to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
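#
# illustrative (assumed) pseudo-code for each state:
#     x <- 1                  NO_INDENT   (no colon on the line)
#     if a = 1 then b <- 2    MAY_INDENT  (colon, but body on same line)
#     if a = 1 then           MUST_INDENT (colon then newline: the next
#         b <- 2                           real token must be indented)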
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert pseudo-code syntax into python-like colon syntax by identifying
# tokens which tell us whether a "hidden colon" is needed.  this in turn
# means that track_tokens_filter "works" without needing complex
# grammar rules
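#
# illustrative (assumed) examples:
#     "if a = 1 then b <- 2" - the THEN token is simply re-typed as COLON
#     "do while n < 4"       - DO/WHILE set a flag so a COLON token is
#                              synthesised just before the NEWLINE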


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's "ignore" state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)


# Track the indentation level and emit the right INDENT / DEDENT events.
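#
# illustrative (assumed) stream for the pseudo-code:
#     do while n < 4
#         n <- n + 1
#     x <- 1
# becomes: DO WHILE NAME LT NUMBER COLON NEWLINE INDENT NAME ASSIGN NAME
# PLUS NUMBER NEWLINE DEDENT NAME ASSIGN NUMBER NEWLINE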
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos,
                                       token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
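
# filter pipeline summary: raw ply tokens -> python_colonify (THEN/DO/
# WHILE etc. introduce COLON) -> track_tokens_filter (tags at_line_start
# and must_indent) -> indentation_filter (emits INDENT/DEDENT) -> an
# optional ENDMARKER at the end of the stream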


KEYWORD_REPLACEMENTS = {'class': 'class_'}

##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'QMARK',
        'PERIOD',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 2), len(val)-2)
        return t
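
    # illustrative (assumed) values: t_HEX turns "0xFF" into
    # SelectableInt(255, 8) (4 bits per hex nibble), and t_BINARY turns
    # "0b101" into SelectableInt(5, 3) (one bit per binary digit)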

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"""
        t.value = int(t.value)  # note: rejects non-integer literals
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        # print(repr(t.value))  # debug leftover, disabled
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_PERIOD = r'\.'  # escaped: an unescaped '.' would match any character
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'
    t_QMARK = r'\?'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        if t.value in KEYWORD_REPLACEMENTS:
            t.value = KEYWORD_REPLACEMENTS[t.value]
        return t

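    # illustrative (assumed) behaviour: "while" lexes as WHILE, "whilex"
    # as a plain NAME; "class" stays a NAME but its value is rewritten to
    # "class_" via KEYWORD_REPLACEMENTS, presumably to avoid colliding
    # with the Python keyword when the pseudo-code is compiled to Python.
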
    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the later code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside parentheses, eg
    # a = (1,
    #      2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # unreachable: raise_syntax_error() always raises
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)