# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags the check to
# see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

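# Illustration (hypothetical pseudo-code lines, not from the test inputs
# below): once "then" has been rewritten to a COLON, the states map thus:
#   "x <- 1"               -> NO_INDENT   (no colon: no block expected)
#   "if a = 1 then b <- 2" -> MAY_INDENT  (colon with a same-line body)
#   "if a = 1 then"        -> MUST_INDENT (colon then NEWLINE: block required)
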
# turn pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


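# Example (hypothetical token streams, shown by token type only):
#   in:  WHILE NAME LT NUMBER NEWLINE   out: WHILE NAME LT NUMBER COLON NEWLINE
#   in:  IF NAME THEN NAME              out: IF NAME COLON NAME
# i.e. "then" is re-typed as COLON, an ELSE gains a COLON after it, and
# DO/WHILE/FOR/SWITCH lines gain an implied COLON just before the NEWLINE.
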
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
    lexer.at_line_start = at_line_start


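# Example (hypothetical): for the input "if a then\n    b <- 1\n", the
# NAME token "b" enters track_tokens_filter at the start of the second
# line, just after a COLON NEWLINE pair, so it leaves the filter tagged
# with at_line_start=True and must_indent=True (indent == MUST_INDENT).
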
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT token


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT token


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # count leading spaces; returns 0 for an empty or all-space line
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves writing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)


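# Example (hypothetical input to annoying_case_hack_filter): two adjacent
# case lines at the same indent, the first with no body, become:
#     case(2):             ->    case(2): fallthrough
#     case(4):                   case(4):
#         x <- 3                     x <- 3
# the added "fallthrough" keyword lets the parser treat the empty case
# as a small expression that falls through to the next case.
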
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


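# Example (hypothetical): after colonification and tagging, the input
#   "while n < 2\n    n <- n + 1\nx <- 5\n"
# comes out of this filter roughly as:
#   WHILE NAME LT NUMBER COLON NEWLINE INDENT NAME ASSIGN NAME PLUS NUMBER
#   NEWLINE DEDENT NAME ASSIGN NUMBER NEWLINE
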
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

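# summary of the pipeline assembled above:
#   raw PLY tokens -> python_colonify -> track_tokens_filter
#                  -> indentation_filter -> (optional ENDMARKER)
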
##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

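    # bit-width examples (hypothetical): each hex digit is a nibble (4
    # bits) and each binary digit is one bit, the "0x"/"0b" prefix being
    # excluded from the count:
    #   0x00FF -> SelectableInt(255, 16)
    #   0b0101 -> SelectableInt(5, 4)
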
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print(repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments in
    # them so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside parentheses, e.g.
    #   a = (1,
    #        2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable: the alternative is to skip the offending character
        #print("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


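# minimal usage sketch (the __main__ demo below does the same in full):
#   lexer = IndentLexer(debug=0)
#   lexer.input("x <- 1\n")
#   for tok in iter(lexer.token, None):
#       print(tok)
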
switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)