oppc/code: rename oppc_int to oppc_value
[openpower-isa.git] src/openpower/oppc/pc_lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution

from copy import copy

from ply import lex

import openpower.oppc.pc_ast as pc_ast


# build a ply token-rule handler from a pc_ast node class: the matched
# text is wrapped in cls, and the handler inherits cls.__doc__ so that
# ply uses it as the token's regex
def bind(cls):
    def wrapper(self, t):
        t.value = cls(t.value)
        return t

    wrapper.__name__ = cls.__name__
    wrapper.__doc__ = cls.__doc__

    return wrapper


class SyntaxError2(Exception):
    """ class used to raise a syntax error but get ply to stop eating errors,
    since ply catches and discards SyntaxError after setting a flag.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind("\n", 0, lexpos) + 1
    line_end = input_text.find("\n", line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]), cls=cls)

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes:
# "must_indent" is True if the token must be indented from the
# previous code, and "at_line_start" is True for WS and for the first
# non-WS/non-NEWLINE token on a line.  These flags let the later
# filters check whether a new line has changed its indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed;
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
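#
# for example (an illustrative sketch, not a line from any real spec
# entry), the pseudo-code
#     if a = 0 then b <- 1
# lexes as IF NAME EQ NUMBER THEN NAME ASSIGN NUMBER, and
# python_colonify rewrites the THEN token into the COLON that the
# python-style grammar expects.  a header such as "do while a < 4" has
# no THEN at all, so an implied COLON token is inserted just before its
# NEWLINE instead.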


def python_colonify(lexer, tokens):
    implied_colon_needed = False
    for token in tokens:
        if token.type == "THEN":
            # turn then into colon
            token.type = "COLON"
            token.value = pc_ast.Colon(str(token.value))
            yield token
        elif token.type == "ELSE":
            yield token
            token = copy(token)
            token.type = "COLON"
            token.value = pc_ast.Colon(str(token.value))
            yield token
        elif token.type in ["DO", "WHILE", "FOR", "SWITCH"]:
            implied_colon_needed = True
            yield token
        elif token.type == "NEWLINE":
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                ctok.value = pc_ast.Colon(str(token.value))
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ("ignore", " ")
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    cls = {
        "ENDMARKER": pc_ast.Endmarker,
        "INDENT": pc_ast.Indent,
        "DEDENT": pc_ast.Dedent,
    }[type]
    tok = lex.LexToken()
    tok.type = type
    tok.value = cls()
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # count leading spaces; returns 0 both for unindented lines and for
    # lines consisting entirely of spaces
    for i in range(len(l)):
        if l[i] != " ":
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick lets the parser treat the (silent) case statement as a
    "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    an illustrative before/after example follows this function.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append("")
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            prev_spc_count = None
        res.append(l)
    return "\n".join(res)

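# A worked example of annoying_case_hack_filter (illustrative only; the
# operand names are made up):
#
#     switch(m)
#         case(0): a <- 1
#         case(1):
#         case(2): a <- 3
#         default: a <- 4
#
# becomes
#
#     switch(m)
#         case(0): a <- 1
#         case(1): fallthrough
#         case(2): a <- 3
#         default: a <- 4
#
# i.e. a "case" line that ends in ":" and is followed by another
# "case"/"default" line at the same indentation gains the silent
# "fallthrough" keyword for the parser to spot later.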

# Track the indentation level and emit the right INDENT / DEDENT events.
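# For example (illustrative), after python_colonify and
# track_tokens_filter the pseudo-code
#     if a = 0 then
#         b <- 1
#     c <- 2
# arrives here roughly as
#     IF NAME EQ NUMBER COLON NEWLINE WS NAME ASSIGN NUMBER NEWLINE
#     NAME ASSIGN NUMBER NEWLINE
# and leaves as
#     IF NAME EQ NUMBER COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE
#     DEDENT NAME ASSIGN NUMBER NEWLINE
# with the WS token consumed and INDENT/DEDENT synthesized in its place.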
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos, token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)


##### Lexer ######
class Lexer:
    tokens = (
        "DEF",
        "IF",
        "THEN",
        "ELSE",
        "FOR",
        "TO",
        "DO",
        "WHILE",
        "BREAK",
        "NAME",
        "HEX",      # hex numbers
        "NUMBER",   # Python decimals
        "BINARY",   # Python binary
        "STRING",   # single quoted strings only; syntax of raw strings
        "LPAR",
        "RPAR",
        "LBRACK",
        "RBRACK",
        "COLON",
        "EQ",
        "ASSIGNEA",
        "ASSIGN",
        "LTU",
        "GTU",
        "NE",
        "LE",
        "LSHIFT",
        "RSHIFT",
        "GE",
        "LT",
        "GT",
        "PLUS",
        "MINUS",
        "MULT",
        "DIV",
        "MOD",
        "INVERT",
        "APPEND",
        "BITOR",
        "BITAND",
        "BITXOR",
        "RETURN",
        "SWITCH",
        "CASE",
        "DEFAULT",
        "WS",
        "NEWLINE",
        "COMMA",
        "QMARK",
        "PERIOD",
        "SEMICOLON",
        "INDENT",
        "DEDENT",
        "ENDMARKER",
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    @lex.TOKEN(pc_ast.HexLiteral.__doc__)
    def t_HEX(self, t):
        t.value = pc_ast.HexLiteral(t.value)
        return t

    @lex.TOKEN(pc_ast.BinLiteral.__doc__)
    def t_BINARY(self, t):
        t.value = pc_ast.BinLiteral(t.value)
        return t

    @lex.TOKEN(pc_ast.DecLiteral.__doc__)
    def t_NUMBER(self, t):
        t.value = pc_ast.DecLiteral(t.value)
        return t

    @lex.TOKEN(pc_ast.StringLiteral.__doc__)
    def t_STRING(self, t):
        t.value = pc_ast.StringLiteral(t.value[1:-1])
        return t

    t_COLON = pc_ast.Colon.__doc__
    t_EQ = pc_ast.Eq.__doc__
    t_ASSIGNEA = pc_ast.AssignIEAOp.__doc__
    t_ASSIGN = pc_ast.AssignOp.__doc__
    t_LTU = pc_ast.LtU.__doc__
    t_GTU = pc_ast.GtU.__doc__
    t_NE = pc_ast.NotEq.__doc__
    t_LE = pc_ast.Le.__doc__
    t_GE = pc_ast.Ge.__doc__
    t_LSHIFT = pc_ast.LShift.__doc__
    t_RSHIFT = pc_ast.RShift.__doc__
    t_LT = pc_ast.Lt.__doc__
    t_GT = pc_ast.Gt.__doc__
    t_PLUS = pc_ast.Add.__doc__
    t_MINUS = pc_ast.Sub.__doc__
    t_MULT = pc_ast.Mul.__doc__
    t_DIV = pc_ast.Div.__doc__
    t_MOD = pc_ast.Mod.__doc__
    t_INVERT = pc_ast.Not.__doc__
    t_COMMA = pc_ast.Comma.__doc__
    t_PERIOD = pc_ast.Period.__doc__
    t_SEMICOLON = pc_ast.Semicolon.__doc__
    t_APPEND = pc_ast.BitConcat.__doc__
    t_BITOR = pc_ast.BitOr.__doc__
    t_BITAND = pc_ast.BitAnd.__doc__
    t_BITXOR = pc_ast.BitXor.__doc__
    t_QMARK = pc_ast.Question.__doc__

    @lex.TOKEN(pc_ast.Symbol)
    def t_NAME(self, t):
        keywords = {
            "def": ("DEF", pc_ast.FunctionKeyword),
            "if": ("IF", pc_ast.IfKeyword),
            "then": ("THEN", pc_ast.ThenKeyword),
            "else": ("ELSE", pc_ast.ElseKeyword),
            "leave": ("BREAK", pc_ast.LeaveKeyword),
            "for": ("FOR", pc_ast.ForKeyword),
            "to": ("TO", pc_ast.ToKeyword),
            "while": ("WHILE", pc_ast.WhileKeyword),
            "do": ("DO", pc_ast.DoKeyword),
            "return": ("RETURN", pc_ast.ReturnKeyword),
            "switch": ("SWITCH", pc_ast.SwitchKeyword),
            "case": ("CASE", pc_ast.CaseKeyword),
            "default": ("DEFAULT", pc_ast.DefaultKeyword),
        }
        (tt, tcls) = keywords.get(t.value, ("NAME", pc_ast.Symbol))
        t.type = tt
        t.value = tcls(t.value)
        return t

    # Putting this before t_WS lets it consume lines containing only
    # comments, so the later code never sees the WS part.  The newline
    # is not consumed; that is needed for "if 1: #comment"
    @lex.TOKEN(pc_ast.Comment.__doc__)
    def t_comment(self, t):
        return None

    # Whitespace
    @lex.TOKEN(pc_ast.Whitespace.__doc__)
    def t_WS(self, t):
        if (t.lexer.at_line_start and
                t.lexer.paren_count == 0 and
                t.lexer.brack_count == 0):
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    @lex.TOKEN(pc_ast.Linebreak.__doc__)
    def t_newline(self, t):
        t.lexer.lineno += len(t.value)
        t.value = pc_ast.Linebreak(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    @lex.TOKEN(pc_ast.LBracket.__doc__)
    def t_LBRACK(self, t):
        t.lexer.brack_count += 1
        t.value = pc_ast.LBracket(t.value)
        return t

    @lex.TOKEN(pc_ast.RBracket.__doc__)
    def t_RBRACK(self, t):
        t.lexer.brack_count -= 1
        t.value = pc_ast.RBracket(t.value)
        return t

    @lex.TOKEN(pc_ast.LParenthesis.__doc__)
    def t_LPAR(self, t):
        t.lexer.paren_count += 1
        t.value = pc_ast.LParenthesis(t.value)
        return t

    @lex.TOKEN(pc_ast.RParenthesis.__doc__)
    def t_RPAR(self, t):
        t.lexer.paren_count -= 1
        t.value = pc_ast.RParenthesis(t.value)
        return t

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(Lexer):
    def __init__(self, debug=False, optimize=False, lextab="lextab"):
        self.debug = debug
        self.build(debug=debug, optimize=optimize, lextab=lextab)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        # The simplest way to convert "simple" tokens to classes.
        # Functions won't work due to ply's reliance on __code__.
        # We end up with (LT+MINUS) instead of ASSIGN otherwise.
        mapping = {
            "COLON": pc_ast.Colon,
            "EQ": pc_ast.Eq,
            "ASSIGNEA": pc_ast.AssignIEAOp,
            "ASSIGN": pc_ast.AssignOp,
            "LTU": pc_ast.LtU,
            "GTU": pc_ast.GtU,
            "NE": pc_ast.NotEq,
            "LE": pc_ast.Le,
            "GE": pc_ast.Ge,
            "LSHIFT": pc_ast.LShift,
            "RSHIFT": pc_ast.RShift,
            "LT": pc_ast.Lt,
            "GT": pc_ast.Gt,
            "PLUS": pc_ast.Add,
            "MINUS": pc_ast.Sub,
            "MULT": pc_ast.Mul,
            "DIV": pc_ast.Div,
            "MOD": pc_ast.Mod,
            "INVERT": pc_ast.Not,
            "COMMA": pc_ast.Comma,
            "PERIOD": pc_ast.Period,
            "SEMICOLON": pc_ast.Semicolon,
            "APPEND": pc_ast.BitConcat,
            "BITOR": pc_ast.BitOr,
            "BITAND": pc_ast.BitAnd,
            "BITXOR": pc_ast.BitXor,
            "QMARK": pc_ast.Question,
        }
        try:
            t = next(self.token_stream)
            if t is not None:
                if t.type in mapping:
                    t.value = mapping[t.type](t.value)
            return t
        except StopIteration:
            return None
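

# Minimal usage sketch (illustrative only: the real callers live in the
# oppc parser, and the pseudo-code below is made up for the demo).  The
# token types printed should be, roughly: IF NAME EQ NUMBER COLON NEWLINE
# INDENT NAME ASSIGN NUMBER NEWLINE DEDENT ENDMARKER.
if __name__ == "__main__":
    demo_lexer = IndentLexer(debug=False)
    demo_lexer.filename = "<demo>"
    demo_lexer.input("if a = 0 then\n    b <- 1\n")
    while True:
        tok = demo_lexer.token()
        if tok is None:
            break
        print(tok.type, tok.value)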