oppc/code: rename oppc_int to oppc_value
[openpower-isa.git] src/openpower/oppc/pc_lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution

from copy import copy

from ply import lex

import openpower.oppc.pc_ast as pc_ast


# build a ply token-rule handler from a pc_ast node class: the matched
# text is wrapped in cls, and the handler inherits cls.__doc__ so that
# ply uses it as the token's regex
def bind(cls):
    def wrapper(self, t):
        t.value = cls(t.value)
        return t

    wrapper.__name__ = cls.__name__
    wrapper.__doc__ = cls.__doc__

    return wrapper


class SyntaxError2(Exception):
    """ class used to raise a syntax error but get ply to stop eating errors,
    since ply catches and discards SyntaxError after setting a flag.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind("\n", 0, lexpos) + 1
    line_end = input_text.find("\n", line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]), cls=cls)

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes:
# "must_indent" is True if the token must be indented from the
# previous code, and "at_line_start" is True for WS and for the first
# non-WS/non-NEWLINE token on a line.  These flags let the later
# filters check whether a new line has changed its indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed;
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
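#
# for example (an illustrative sketch, not a line from any real spec
# entry), the pseudo-code
#     if a = 0 then b <- 1
# lexes as IF NAME EQ NUMBER THEN NAME ASSIGN NUMBER, and
# python_colonify rewrites the THEN token into the COLON that the
# python-style grammar expects.  a header such as "do while a < 4" has
# no THEN at all, so an implied COLON token is inserted just before its
# NEWLINE instead.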


def python_colonify(lexer, tokens):
    implied_colon_needed = False
    for token in tokens:
        if token.type == "THEN":
            # turn then into colon
            token.type = "COLON"
            token.value = pc_ast.Colon(str(token.value))
            yield token
        elif token.type == "ELSE":
            yield token
            token = copy(token)
            token.type = "COLON"
            token.value = pc_ast.Colon(str(token.value))
            yield token
        elif token.type in ["DO", "WHILE", "FOR", "SWITCH"]:
            implied_colon_needed = True
            yield token
        elif token.type == "NEWLINE":
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                ctok.value = pc_ast.Colon(str(token.value))
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ("ignore", " ")
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    cls = {
        "ENDMARKER": pc_ast.Endmarker,
        "INDENT": pc_ast.Indent,
        "DEDENT": pc_ast.Dedent,
    }[type]
    tok = lex.LexToken()
    tok.type = type
    tok.value = cls()
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # count leading spaces; returns 0 both for unindented lines and for
    # lines consisting entirely of spaces
    for i in range(len(l)):
        if l[i] != " ":
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick lets the parser treat the (silent) case statement as a
    "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    an illustrative before/after example follows this function.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append("")
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            prev_spc_count = None
        res.append(l)
    return "\n".join(res)

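# A worked example of annoying_case_hack_filter (illustrative only; the
# operand names are made up):
#
#     switch(m)
#         case(0): a <- 1
#         case(1):
#         case(2): a <- 3
#         default: a <- 4
#
# becomes
#
#     switch(m)
#         case(0): a <- 1
#         case(1): fallthrough
#         case(2): a <- 3
#         default: a <- 4
#
# i.e. a "case" line that ends in ":" and is followed by another
# "case"/"default" line at the same indentation gains the silent
# "fallthrough" keyword for the parser to spot later.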

# Track the indentation level and emit the right INDENT / DEDENT events.
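# For example (illustrative), after python_colonify and
# track_tokens_filter the pseudo-code
#     if a = 0 then
#         b <- 1
#     c <- 2
# arrives here roughly as
#     IF NAME EQ NUMBER COLON NEWLINE WS NAME ASSIGN NUMBER NEWLINE
#     NAME ASSIGN NUMBER NEWLINE
# and leaves as
#     IF NAME EQ NUMBER COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE
#     DEDENT NAME ASSIGN NUMBER NEWLINE
# with the WS token consumed and INDENT/DEDENT synthesized in its place.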
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos, token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)


##### Lexer ######
class Lexer:
    tokens = (
        "DEF",
        "IF",
        "THEN",
        "ELSE",
        "FOR",
        "TO",
        "DO",
        "WHILE",
        "BREAK",
        "NAME",
        "HEX",      # hex numbers
        "NUMBER",   # Python decimals
        "BINARY",   # Python binary
        "STRING",   # single quoted strings only; syntax of raw strings
        "LPAR",
        "RPAR",
        "LBRACK",
        "RBRACK",
        "COLON",
        "EQ",
        "ASSIGNEA",
        "ASSIGN",
        "LTU",
        "GTU",
        "NE",
        "LE",
        "LSHIFT",
        "RSHIFT",
        "GE",
        "LT",
        "GT",
        "PLUS",
        "MINUS",
        "MULT",
        "DIV",
        "MOD",
        "INVERT",
        "APPEND",
        "BITOR",
        "BITAND",
        "BITXOR",
        "RETURN",
        "SWITCH",
        "CASE",
        "DEFAULT",
        "WS",
        "NEWLINE",
        "COMMA",
        "QMARK",
        "PERIOD",
        "SEMICOLON",
        "INDENT",
        "DEDENT",
        "ENDMARKER",
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    @lex.TOKEN(pc_ast.HexLiteral.__doc__)
    def t_HEX(self, t):
        t.value = pc_ast.HexLiteral(t.value)
        return t

    @lex.TOKEN(pc_ast.BinLiteral.__doc__)
    def t_BINARY(self, t):
        t.value = pc_ast.BinLiteral(t.value)
        return t

    @lex.TOKEN(pc_ast.DecLiteral.__doc__)
    def t_NUMBER(self, t):
        t.value = pc_ast.DecLiteral(t.value)
        return t

    @lex.TOKEN(pc_ast.StringLiteral.__doc__)
    def t_STRING(self, t):
        t.value = pc_ast.StringLiteral(t.value[1:-1])
        return t

    t_COLON = pc_ast.Colon.__doc__
    t_EQ = pc_ast.Eq.__doc__
    t_ASSIGNEA = pc_ast.AssignIEAOp.__doc__
    t_ASSIGN = pc_ast.AssignOp.__doc__
    t_LTU = pc_ast.LtU.__doc__
    t_GTU = pc_ast.GtU.__doc__
    t_NE = pc_ast.NotEq.__doc__
    t_LE = pc_ast.Le.__doc__
    t_GE = pc_ast.Ge.__doc__
    t_LSHIFT = pc_ast.LShift.__doc__
    t_RSHIFT = pc_ast.RShift.__doc__
    t_LT = pc_ast.Lt.__doc__
    t_GT = pc_ast.Gt.__doc__
    t_PLUS = pc_ast.Add.__doc__
    t_MINUS = pc_ast.Sub.__doc__
    t_MULT = pc_ast.Mul.__doc__
    t_DIV = pc_ast.Div.__doc__
    t_MOD = pc_ast.Mod.__doc__
    t_INVERT = pc_ast.Not.__doc__
    t_COMMA = pc_ast.Comma.__doc__
    t_PERIOD = pc_ast.Period.__doc__
    t_SEMICOLON = pc_ast.Semicolon.__doc__
    t_APPEND = pc_ast.BitConcat.__doc__
    t_BITOR = pc_ast.BitOr.__doc__
    t_BITAND = pc_ast.BitAnd.__doc__
    t_BITXOR = pc_ast.BitXor.__doc__
    t_QMARK = pc_ast.Question.__doc__

    @lex.TOKEN(pc_ast.Symbol)
    def t_NAME(self, t):
        keywords = {
            "def": ("DEF", pc_ast.FunctionKeyword),
            "if": ("IF", pc_ast.IfKeyword),
            "then": ("THEN", pc_ast.ThenKeyword),
            "else": ("ELSE", pc_ast.ElseKeyword),
            "leave": ("BREAK", pc_ast.LeaveKeyword),
            "for": ("FOR", pc_ast.ForKeyword),
            "to": ("TO", pc_ast.ToKeyword),
            "while": ("WHILE", pc_ast.WhileKeyword),
            "do": ("DO", pc_ast.DoKeyword),
            "return": ("RETURN", pc_ast.ReturnKeyword),
            "switch": ("SWITCH", pc_ast.SwitchKeyword),
            "case": ("CASE", pc_ast.CaseKeyword),
            "default": ("DEFAULT", pc_ast.DefaultKeyword),
        }
        (tt, tcls) = keywords.get(t.value, ("NAME", pc_ast.Symbol))
        t.type = tt
        t.value = tcls(t.value)
        return t

    # Putting this before t_WS lets it consume lines containing only
    # comments, so the later code never sees the WS part.  The newline
    # is not consumed; that is needed for "if 1: #comment"
    @lex.TOKEN(pc_ast.Comment.__doc__)
    def t_comment(self, t):
        return None

    # Whitespace
    @lex.TOKEN(pc_ast.Whitespace.__doc__)
    def t_WS(self, t):
        if (t.lexer.at_line_start and
                t.lexer.paren_count == 0 and
                t.lexer.brack_count == 0):
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    @lex.TOKEN(pc_ast.Linebreak.__doc__)
    def t_newline(self, t):
        t.lexer.lineno += len(t.value)
        t.value = pc_ast.Linebreak(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    @lex.TOKEN(pc_ast.LBracket.__doc__)
    def t_LBRACK(self, t):
        t.lexer.brack_count += 1
        t.value = pc_ast.LBracket(t.value)
        return t

    @lex.TOKEN(pc_ast.RBracket.__doc__)
    def t_RBRACK(self, t):
        t.lexer.brack_count -= 1
        t.value = pc_ast.RBracket(t.value)
        return t

    @lex.TOKEN(pc_ast.LParenthesis.__doc__)
    def t_LPAR(self, t):
        t.lexer.paren_count += 1
        t.value = pc_ast.LParenthesis(t.value)
        return t

    @lex.TOKEN(pc_ast.RParenthesis.__doc__)
    def t_RPAR(self, t):
        t.lexer.paren_count -= 1
        t.value = pc_ast.RParenthesis(t.value)
        return t

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(Lexer):
    def __init__(self, debug=False, optimize=False, lextab="lextab"):
        self.debug = debug
        self.build(debug=debug, optimize=optimize, lextab=lextab)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        # The simplest way to convert "simple" tokens to classes.
        # Functions won't work due to ply's reliance on __code__.
        # We end up with (LT+MINUS) instead of ASSIGN otherwise.
        mapping = {
            "COLON": pc_ast.Colon,
            "EQ": pc_ast.Eq,
            "ASSIGNEA": pc_ast.AssignIEAOp,
            "ASSIGN": pc_ast.AssignOp,
            "LTU": pc_ast.LtU,
            "GTU": pc_ast.GtU,
            "NE": pc_ast.NotEq,
            "LE": pc_ast.Le,
            "GE": pc_ast.Ge,
            "LSHIFT": pc_ast.LShift,
            "RSHIFT": pc_ast.RShift,
            "LT": pc_ast.Lt,
            "GT": pc_ast.Gt,
            "PLUS": pc_ast.Add,
            "MINUS": pc_ast.Sub,
            "MULT": pc_ast.Mul,
            "DIV": pc_ast.Div,
            "MOD": pc_ast.Mod,
            "INVERT": pc_ast.Not,
            "COMMA": pc_ast.Comma,
            "PERIOD": pc_ast.Period,
            "SEMICOLON": pc_ast.Semicolon,
            "APPEND": pc_ast.BitConcat,
            "BITOR": pc_ast.BitOr,
            "BITAND": pc_ast.BitAnd,
            "BITXOR": pc_ast.BitXor,
            "QMARK": pc_ast.Question,
        }
        try:
            t = next(self.token_stream)
            if t is not None:
                if t.type in mapping:
                    t.value = mapping[t.type](t.value)
            return t
        except StopIteration:
            return None
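

# Minimal usage sketch (illustrative only: the real callers live in the
# oppc parser, and the pseudo-code below is made up for the demo).  The
# token types printed should be, roughly: IF NAME EQ NUMBER COLON NEWLINE
# INDENT NAME ASSIGN NUMBER NEWLINE DEDENT ENDMARKER.
if __name__ == "__main__":
    demo_lexer = IndentLexer(debug=False)
    demo_lexer.filename = "<demo>"
    demo_lexer.input("if a = 0 then\n    b <- 1\n")
    while True:
        tok = demo_lexer.token()
        if tok is None:
            break
        print(tok.type, tok.value)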