# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt


def raise_syntax_error(msg, filename, lineno, lexpos, input_text):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError(str(msg), (filename, lineno, col,
                                 input_text[line_start:line_end]))

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE token on a line. It flags the check
# to see whether the new line has changed indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
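
# examples of the three states (illustrative, not executed):
#     x <- 1                  NO_INDENT:   no COLON seen on the line
#     if a = 1 then b <- 2    MAY_INDENT:  COLON (from "then"), body on
#                                          the same line
#     if a = 1 then           MUST_INDENT: COLON followed by NEWLINE,
#         b <- 2                           so the next line must indent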

# turn pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed;
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


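# example (illustrative): "do while n < 64" carries no colon in the
# pseudo-code, so python_colonify inserts a COLON token before the
# NEWLINE, exactly as if the source had read "do while n < 64:".
# "then" is simply renamed to COLON, and "else" gets a COLON appended.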
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
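
# example (illustrative): for the input "if a = 1 then\n    go()" the
# NAME token "go" is tagged must_indent=True, because the COLON (from
# "then") was followed by a NEWLINE before any real token appeared.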


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # return the index of the first non-space character.  a line that
    # is entirely spaces (or empty) is reported as all-indent, so that
    # annoying_case_hack_filter treats it as blank.
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return len(l)


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick coaxes the parser into taking the (silent) case statement
    as a "small expression". it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously. fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
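
# example (illustrative): two directly-adjacent case lines at the same
# indent level are rewritten so that the first falls through:
#     case(2):                      case(2): fallthrough
#     case(4):          becomes     case(4):
#         x <- 3                        x <- 3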


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
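
# example (illustrative): the two-line input
#     do while n < 64
#         n <- n + 1
# reaches the parser as
#     DO WHILE NAME LT NUMBER COLON NEWLINE
#     INDENT NAME ASSIGN NAME PLUS NUMBER NEWLINE DEDENT
# (plus the ENDMARKER appended by filter() below).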


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t
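
    # example (illustrative): "0xDEAD_BEEF" lexes to
    # SelectableInt(0xDEADBEEF, 32) - underscores are stripped and each
    # remaining hex digit contributes 4 bits to the width.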

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t
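
    # example (illustrative): "0b101" lexes to SelectableInt(5, 3) -
    # one bit of width per binary digit.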

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"""
        # note: int() only copes with plain integers; the pseudo-code
        # never uses floats or exponents, so the wider regex is unused
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        #print(repr(t.value))  # leftover debug output
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t
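
    # example (illustrative): "for" is remapped to a FOR keyword token
    # via RESERVED, while "forward" falls through to a plain NAME.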

    # Putting this before t_WS lets it consume lines containing only
    # comments, so the later rules never see the WS part. It does not
    # consume the newline. Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parentheses, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # not reached: raise_syntax_error always raises, so there is
        # no skip-and-continue recovery here


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None
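
# typical use (illustrative):
#     lexer = IndentLexer(debug=0)
#     lexer.input("if a = 1 then\n    b <- 2\n")
#     # iter(lexer.token, None) then yields:
#     #     IF NAME EQ NUMBER COLON NEWLINE
#     #     INDENT NAME ASSIGN NUMBER NEWLINE
#     #     DEDENT ENDMARKER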


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)