# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from openpower.decoder.selectable_int import SelectableInt


def raise_syntax_error(msg, filename, lineno, lexpos, input_text):
    line_start = input_text.rfind('\n', 0, lexpos) + 1
    line_end = input_text.find('\n', line_start)
    col = (lexpos - line_start) + 1
    raise SyntaxError(str(msg), (filename, lineno, col,
                                 input_text[line_start:line_end]))

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE token on a line. It flags the check
# to see whether the new line has changed indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
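
# examples of the three states (illustrative, not executed):
#     x <- 1                  NO_INDENT:   no COLON seen on the line
#     if a = 1 then b <- 2    MAY_INDENT:  COLON (from "then"), body on
#                                          the same line
#     if a = 1 then           MUST_INDENT: COLON followed by NEWLINE,
#         b <- 2                           so the next line must indent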

# turn pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed;
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules


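# example (illustrative): "do while n < 64" carries no colon in the
# pseudo-code, so python_colonify inserts a COLON token before the
# NEWLINE, exactly as if the source had read "do while n < 64:".
# "then" is simply renamed to COLON, and "else" gets a COLON appended.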
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start
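
# example (illustrative): for the input "if a = 1 then\n    go()" the
# NAME token "go" is tagged must_indent=True, because the COLON (from
# "then") was followed by a NEWLINE before any real token appeared.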


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    # return the index of the first non-space character.  a line that
    # is entirely spaces (or empty) is reported as all-indent, so that
    # annoying_case_hack_filter treats it as blank.
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return len(l)


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick coaxes the parser into taking the (silent) case statement
    as a "small expression". it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously. fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            res.append('')
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)
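
# example (illustrative): two directly-adjacent case lines at the same
# indent level are rewritten so that the first falls through:
#     case(2):                      case(2): fallthrough
#     case(4):          becomes     case(4):
#         x <- 3                        x <- 3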


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
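
# example (illustrative): the two-line input
#     do while n < 64
#         n <- n + 1
# reaches the parser as
#     DO WHILE NAME LT NUMBER COLON NEWLINE
#     INDENT NAME ASSIGN NAME PLUS NUMBER NEWLINE DEDENT
# (plus the ENDMARKER appended by filter() below).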


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######


class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)
        self.filename = None

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t
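
    # example (illustrative): "0xDEAD_BEEF" lexes to
    # SelectableInt(0xDEADBEEF, 32) - underscores are stripped and each
    # remaining hex digit contributes 4 bits to the width.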

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t
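
    # example (illustrative): "0b101" lexes to SelectableInt(5, 3) -
    # one bit of width per binary digit.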

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"""
        # note: int() only copes with plain integers; the pseudo-code
        # never uses floats or exponents, so the wider regex is unused
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        #print(repr(t.value))  # leftover debug output
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t
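
    # example (illustrative): "for" is remapped to a FOR keyword token
    # via RESERVED, while "forward" falls through to a plain NAME.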

    # Putting this before t_WS lets it consume lines containing only
    # comments, so the later rules never see the WS part. It does not
    # consume the newline. Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parentheses, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise_syntax_error("Unknown symbol %r" % (t.value[0],),
                           self.filename, t.lexer.lineno,
                           t.lexer.lexpos, t.lexer.lexdata)
        # not reached: raise_syntax_error always raises, so there is
        # no skip-and-continue recovery here


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.lineno = 1
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None
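
# typical use (illustrative):
#     lexer = IndentLexer(debug=0)
#     lexer.input("if a = 1 then\n    b <- 2\n")
#     # iter(lexer.token, None) then yields:
#     #     IF NAME EQ NUMBER COLON NEWLINE
#     #     INDENT NAME ASSIGN NUMBER NEWLINE
#     #     DEDENT ENDMARKER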


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)