Revert "add support for pseudocode being a [[!inline]] directive"
[openpower-isa.git] / src / openpower / decoder / pseudo / lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt


class SyntaxError2(Exception):
    """ class used to raise a syntax error yet still get ply to stop eating
    errors, since ply catches and discards SyntaxError after setting a flag.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]), cls=cls)
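
# note: the args tuple handed to SyntaxError matches CPython's expected
# detail format of (msg, (filename, lineno, col, source_line)), so the
# raised error prints with the offending source line included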

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check
# to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
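#
# illustrative (assumed) pseudo-code for each state:
#     x <- 1                  NO_INDENT   (no colon on the line)
#     if a = 1 then b <- 2    MAY_INDENT  (colon, but body on same line)
#     if a = 1 then           MUST_INDENT (colon then newline: the next
#         b <- 2                           real token must be indented)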
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert pseudo-code syntax into python-like colon syntax by identifying
# tokens which tell us whether a "hidden colon" is needed.  this in turn
# means that track_tokens_filter "works" without needing complex
# grammar rules
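#
# illustrative (assumed) examples:
#     "if a = 1 then b <- 2" - the THEN token is simply re-typed as COLON
#     "do while n < 4"       - DO/WHILE set a flag so a COLON token is
#                              synthesised just before the NEWLINE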


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's "ignore" state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)


# Track the indentation level and emit the right INDENT / DEDENT events.
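#
# illustrative (assumed) stream for the pseudo-code:
#     do while n < 4
#         n <- n + 1
#     x <- 1
# becomes: DO WHILE NAME LT NUMBER COLON NEWLINE INDENT NAME ASSIGN NAME
# PLUS NUMBER NEWLINE DEDENT NAME ASSIGN NUMBER NEWLINE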
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos,
                                       token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)
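
# filter pipeline summary: raw ply tokens -> python_colonify (THEN/DO/
# WHILE etc. introduce COLON) -> track_tokens_filter (tags at_line_start
# and must_indent) -> indentation_filter (emits INDENT/DEDENT) -> an
# optional ENDMARKER at the end of the stream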


KEYWORD_REPLACEMENTS = {'class': 'class_'}

##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'QMARK',
        'PERIOD',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 2), len(val)-2)
        return t
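
    # illustrative (assumed) values: t_HEX turns "0xFF" into
    # SelectableInt(255, 8) (4 bits per hex nibble), and t_BINARY turns
    # "0b101" into SelectableInt(5, 3) (one bit per binary digit)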

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"""
        t.value = int(t.value)  # note: rejects non-integer literals
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        # print(repr(t.value))  # debug leftover, disabled
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_PERIOD = r'\.'  # escaped: an unescaped '.' would match any character
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'
    t_QMARK = r'\?'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        if t.value in KEYWORD_REPLACEMENTS:
            t.value = KEYWORD_REPLACEMENTS[t.value]
        return t

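    # illustrative (assumed) behaviour: "while" lexes as WHILE, "whilex"
    # as a plain NAME; "class" stays a NAME but its value is rewritten to
    # "class_" via KEYWORD_REPLACEMENTS, presumably to avoid colliding
    # with the Python keyword when the pseudo-code is compiled to Python.
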
    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the later code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside parentheses, eg
    # a = (1,
    #      2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # unreachable: raise_syntax_error() always raises
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)