# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags the check to
# see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

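# Illustration (hypothetical pseudo-code lines, not from the test inputs
# below): once "then" has been rewritten to a COLON, the states map thus:
#   "x <- 1"               -> NO_INDENT   (no colon: no block expected)
#   "if a = 1 then b <- 2" -> MAY_INDENT  (colon with a same-line body)
#   "if a = 1 then"        -> MUST_INDENT (colon then NEWLINE: block required)
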
# turn pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


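# Example (hypothetical token streams, shown by token type only):
#   in:  WHILE NAME LT NUMBER NEWLINE   out: WHILE NAME LT NUMBER COLON NEWLINE
#   in:  IF NAME THEN NAME              out: IF NAME COLON NAME
# i.e. "then" is re-typed as COLON, an ELSE gains a COLON after it, and
# DO/WHILE/FOR/SWITCH lines gain an implied COLON just before the NEWLINE.
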
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
    lexer.at_line_start = at_line_start


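# Example (hypothetical): for the input "if a then\n    b <- 1\n", the
# NAME token "b" enters track_tokens_filter at the start of the second
# line, just after a COLON NEWLINE pair, so it leaves the filter tagged
# with at_line_start=True and must_indent=True (indent == MUST_INDENT).
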
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT token


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT token


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # count leading spaces; returns 0 for an empty or all-space line
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves writing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)


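# Example (hypothetical input to annoying_case_hack_filter): two adjacent
# case lines at the same indent, the first with no body, become:
#     case(2):             ->    case(2): fallthrough
#     case(4):                   case(4):
#         x <- 3                     x <- 3
# the added "fallthrough" keyword lets the parser treat the empty case
# as a small expression that falls through to the next case.
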
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


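# Example (hypothetical): after colonification and tagging, the input
#   "while n < 2\n    n <- n + 1\nx <- 5\n"
# comes out of this filter roughly as:
#   WHILE NAME LT NUMBER COLON NEWLINE INDENT NAME ASSIGN NAME PLUS NUMBER
#   NEWLINE DEDENT NAME ASSIGN NUMBER NEWLINE
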
# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

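# summary of the pipeline assembled above:
#   raw PLY tokens -> python_colonify -> track_tokens_filter
#                  -> indentation_filter -> (optional ENDMARKER)
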
##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

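    # bit-width examples (hypothetical): each hex digit is a nibble (4
    # bits) and each binary digit is one bit, the "0x"/"0b" prefix being
    # excluded from the count:
    #   0x00FF -> SelectableInt(255, 16)
    #   0b0101 -> SelectableInt(5, 4)
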
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print(repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments in
    # them so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside parentheses, e.g.
    #   a = (1,
    #        2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable: the alternative is to skip the offending character
        #print("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


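# minimal usage sketch (the __main__ demo below does the same in full):
#   lexer = IndentLexer(debug=0)
#   lexer.input("x <- 1\n")
#   for tok in iter(lexer.token, None):
#       print(tok)
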
switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)