# [soc.git] src/soc/decoder/pseudo/lexer.py
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE tokens.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line. It flags the
# check to see whether the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# turn pseudo-code syntax into python-like colon syntax.
# identify the tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules

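# Illustrative example (not part of the original comments): given the
# pseudo-code line
#
#     do while n < 64
#
# there is no explicit COLON token, so python_colonify() below inserts one
# just before the NEWLINE, as if the input had been "do while n < 64:".
# A THEN token is simply renamed to COLON, and ELSE has a COLON token
# appended immediately after it.

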
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token


# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

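# Illustrative example (not part of the original comments): after
# colonification, "if x = 1 then" followed by an indented "x <- 2" arrives
# here as IF NAME EQ NUMBER COLON NEWLINE WS NAME ASSIGN NUMBER ...; the
# COLON sets MAY_INDENT, the following NEWLINE promotes it to MUST_INDENT,
# and so the first real token of the indented line ("x") is tagged
# must_indent=True for indentation_filter() below to act on.
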

def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0


def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this tricks the parser into taking the (silent) case statement
    as a "small expression".  it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    also skips blank lines

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously.  fixing that involves doing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if len(nwhite) == 0:  # skip blank lines
            continue
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                prev_spc_count == spc_count and
                    (res[-1].endswith(":") or res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)

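# Illustrative example (not part of the original docstring): the filter
# above rewrites
#
#     case(2):
#     case(4):
#         x <- 3
#
# as
#
#     case(2): fallthrough
#     case(4):
#         x <- 3
#
# because both case lines sit at the same indentation and the first ends
# with ":".  The parser can later spot the synthetic "fallthrough" word and
# treat the otherwise-empty case as falling through to the next one.
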

# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print("Process", depth, token.indent, token,)
            if token.at_line_start:
                print("at_line_start",)
            if token.must_indent:
                print("must_indent",)
            print

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

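# Illustrative example (not from the original sources): for the pseudo-code
#
#     if a = 1 then
#         b <- 2
#     c <- 3
#
# this filter yields roughly
#
#     IF NAME EQ NUMBER COLON NEWLINE INDENT NAME ASSIGN NUMBER NEWLINE
#     DEDENT NAME ASSIGN NUMBER NEWLINE
#
# i.e. the same INDENT/DEDENT bracketing that Python's own tokenizer
# produces for real Python source.
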

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)  # hex = nibble
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print(repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments in
    # them so the latter code never sees the WS part.  Not consuming the
    # newline.  Needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace

    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow?  should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow?  should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        print("Skipping", repr(t.value[0]))
        t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print(s)
        s += "\n"
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None


switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):

    case(4):
        x <- 3
    case(9):

    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print(code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print("code")
    print(code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print(token)