# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt

class SyntaxError2(Exception):
    """ Raised in place of SyntaxError so that ply does not eat the error:
    ply catches SyntaxError itself, sets a flag, and discards it.
    """

    def __init__(self, *args, cls=SyntaxError):
        super().__init__(*args)
        self.cls = cls

    def __repr__(self):
        return repr(self.cls(*self.args))

    def __str__(self):
        return str(self.cls(*self.args))

    def raise_syntax_error(self):
        raise self.cls(*self.args) from self


def raise_syntax_error(msg, filename, lineno, lexpos, input_text,
                       cls=SyntaxError):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError2(str(msg), (filename, lineno, col,
                                  input_text[line_start:line_end]), cls=cls)

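
# A minimal usage sketch (hypothetical inputs; nothing in this module calls
# it): the four-tuple handed to SyntaxError2 has the same
# (filename, lineno, col, text) shape that CPython's SyntaxError expects,
# so the re-raised error reports file, line and column.
def _demo_raise_syntax_error():
    text = "x <- y\nz <- 1 ?\n"
    try:
        raise_syntax_error("Unknown symbol '?'", "demo.mdwn", 2,
                           text.index("?"), text)
    except SyntaxError2 as e:
        print(e)  # Unknown symbol '?' (demo.mdwn, line 2)
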
# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags the check to
# see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
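
# For example (illustrative, using the pseudo-code's own "then" syntax,
# which python_colonify below rewrites into COLON tokens):
#
#   x <- 1            NO_INDENT:   no COLON on the line
#   if x = 1 then y   MAY_INDENT:  COLON seen, body on the same line
#   if x = 1 then     MUST_INDENT: COLON then NEWLINE, body must indent
#       y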

# convert pseudo-code "then"-style syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token

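# A small self-check (illustrative; not called by the module).  The filter
# ignores its lexer argument, so None suffices, and _new_token (defined
# below, resolved at call time) is enough to fabricate a stream: THEN
# becomes COLON, and WHILE earns an implied COLON before its NEWLINE.
def _demo_python_colonify():
    types = ['WHILE', 'NAME', 'NEWLINE', 'IF', 'THEN', 'NAME']
    toks = (_new_token(t, lineno=1) for t in types)
    return [t.type for t in python_colonify(None, toks)]
    # ['WHILE', 'NAME', 'COLON', 'NEWLINE', 'IF', 'COLON', 'NAME']
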

# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's ignore state: once
        # "must indent" is seen (i.e. a real token), ignore whitespace.
        # note: ply only ever tests "char in lexignore", so this tuple
        # happens to ignore exactly the space character.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok


# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # number of leading spaces; an all-space line counts in full
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return len(l)


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
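
# Illustrative before/after for the case hack (not called by the module):
# two consecutive "case(...):" lines at the same indent mean the first arm
# is empty, so it gains an explicit "fallthrough" marker for the parser.
def _demo_case_hack():
    before = ("switch (n)\n"
              "    case(2):\n"
              "    case(4):\n"
              "        x <- 3\n")
    return annoying_case_hack_filter(before)
    # "    case(2):" has become "    case(2): fallthrough"
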


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens, filename):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token)
            if token.at_line_start:
                print("at_line_start")
            if token.must_indent:
                print("must_indent")
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise_syntax_error("expected an indented block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise_syntax_error("indent increase but not in new block",
                                   filename, token.lexer.lineno,
                                   token.lexer.lexpos, token.lexer.lexdata,
                                   cls=IndentationError)
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise_syntax_error("inconsistent indentation",
                                       filename, token.lexer.lineno,
                                       token.lexer.lexpos,
                                       token.lexer.lexdata,
                                       cls=IndentationError)
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

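# A runnable end-to-end sketch (IndentLexer is defined further down; the
# name resolves at call time).  It shows the net effect of all the filters
# on a two-line pseudo-code input.
def _demo_indentation_filter():
    lexer = IndentLexer(debug=0)
    lexer.input("if x = 1 then\n    x <- 2\n")
    return [tok.type for tok in iter(lexer.token, None)]
    # ['IF', 'NAME', 'EQ', 'NUMBER', 'COLON', 'NEWLINE',
    #  'INDENT', 'NAME', 'ASSIGN', 'NUMBER', 'NEWLINE',
    #  'DEDENT', 'ENDMARKER']
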

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker, filename):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens, filename):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)


KEYWORD_REPLACEMENTS = {'class': 'class_'}

##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'LSHIFT',
        'RSHIFT',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'QMARK',
        'PERIOD',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 2), len(val)-2)
        return t

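    # Width bookkeeping sketch (illustrative): each hex nibble is 4 bits
    # and each binary digit is 1 bit, underscores excluded, so:
    #   0x01F  -> SelectableInt(0x1f, 12)   # 3 nibbles = 12 bits
    #   0b0101 -> SelectableInt(0b0101, 4)  # 4 digits  = 4 bits
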
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print(repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LSHIFT = r'<<'
    t_RSHIFT = r'>>'
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_PERIOD = r'\.'  # escaped: a bare '.' would match any character
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'
    t_QMARK = r'\?'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        if t.value in KEYWORD_REPLACEMENTS:
            t.value = KEYWORD_REPLACEMENTS[t.value]
        return t

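    # Identifier handling sketch (illustrative): "case" lexes as a CASE
    # token via RESERVED, while "class" stays a NAME but has its value
    # rewritten to "class_" via KEYWORD_REPLACEMENTS, presumably so that
    # downstream generated Python never collides with the real keyword.
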
    # Putting this before t_WS lets it consume lines with only comments in
    # them, so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # unreachable: raise_syntax_error always raises
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker, self.filename)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)