1 # Based on GardenSnake - a parser generator demonstration program
2 # GardenSnake was released into the Public Domain by Andrew Dalke.
4 # Portions of this work are derived from Python's Grammar definition
5 # and may be covered under the Python copyright and license
7 # Andrew Dalke / Dalke Scientific Software, LLC
8 # 30 August 2006 / Cape Town, South Africa
10 # Modifications for inclusion in PLY distribution
13 from soc
.decoder
.selectable_int
import SelectableInt
15 ## I implemented INDENT / DEDENT generation as a post-processing filter
17 # The original lex token stream contains WS and NEWLINE characters.
18 # WS will only occur before any other tokens on a line.
20 # I have three filters. One tags tokens by adding two attributes.
21 # "must_indent" is True if the token must be indented from the
22 # previous code. The other is "at_line_start" which is True for WS
23 # and the first non-WS/non-NEWLINE on a line.  It flags the check to
24 # see if the new line has changed indentation level.
26 # Python's syntax has three INDENT states
27 # 0) no colon hence no need to indent
28 # 1) "if 1: go()" - simple statements have a COLON but no need for an indent
29 # 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
34 # turn into python-like colon syntax from pseudo-code syntax.
35 # identify tokens which tell us whether a "hidden colon" is needed.
36 # this in turn means that track_tokens_filter "works" without needing
37 # complex grammar rules
38 def python_colonify(lexer
, tokens
):
40 implied_colon_needed
= False
42 #print ("track colon token", token, token.type)
44 if token
.type == 'THEN':
45 # turn then into colon
48 elif token
.type == 'ELSE':
53 elif token
.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
54 implied_colon_needed
= True
56 elif token
.type == 'NEWLINE':
57 if implied_colon_needed
:
61 implied_colon_needed
= False
67 # only care about whitespace at the start of a line
68 def track_tokens_filter(lexer
, tokens
):
69 oldignore
= lexer
.lexignore
70 lexer
.at_line_start
= at_line_start
= True
74 #print ("track token", token, token.type)
75 token
.at_line_start
= at_line_start
77 if token
.type == "COLON":
80 token
.must_indent
= False
82 elif token
.type == "NEWLINE":
84 if indent
== MAY_INDENT
:
86 token
.must_indent
= False
88 elif token
.type == "WS":
89 assert token
.at_line_start
== True
91 token
.must_indent
= False
94 # A real token; only indent after COLON NEWLINE
95 if indent
== MUST_INDENT
:
96 token
.must_indent
= True
98 token
.must_indent
= False
102 # really bad hack that changes ignore lexer state.
103 # when "must indent" is seen (basically "real tokens" seen)
104 # then ignore whitespace.
105 if token
.must_indent
:
106 lexer
.lexignore
= ('ignore', ' ')
108 lexer
.lexignore
= oldignore
110 token
.indent
= indent
112 lexer
.at_line_start
= at_line_start
114 def _new_token(type, lineno
):
122 # Synthesize a DEDENT tag
124 return _new_token("DEDENT", lineno
)
126 # Synthesize an INDENT tag
128 return _new_token("INDENT", lineno
)
131 for i
in range(len(l
)):
136 def annoying_case_hack_filter(code
):
137 """add annoying "silent keyword" (fallthrough)
139 this which tricks the parser into taking the (silent) case statement
140 as a "small expression". it can then be spotted and used to indicate
141 "fall through" to the next case (in the parser)
143 also skips blank lines
145 bugs: any function that starts with the letters "case" or "default"
146 will be detected erroneously. fixing that involves doing a token
147 lexer which spots the fact that "case" and "default" are words,
148 separating them from space, colon, bracket etc.
150 http://bugs.libre-riscv.org/show_bug.cgi?id=280
153 prev_spc_count
= None
154 for l
in code
.split("\n"):
155 spc_count
= count_spaces(l
)
156 nwhite
= l
[spc_count
:]
157 if len(nwhite
) == 0: # skip blank lines
159 if nwhite
.startswith("case") or nwhite
.startswith("default"):
160 #print ("case/default", nwhite, spc_count, prev_spc_count)
161 if (prev_spc_count
is not None and
162 prev_spc_count
== spc_count
and
163 (res
[-1].endswith(":") or res
[-1].endswith(": fallthrough"))):
164 res
[-1] += " fallthrough" # add to previous line
165 prev_spc_count
= spc_count
167 #print ("notstarts", spc_count, nwhite)
168 prev_spc_count
= None
170 return '\n'.join(res
)
173 # Track the indentation level and emit the right INDENT / DEDENT events.
174 def indentation_filter(tokens
):
175 # A stack of indentation levels; will never pop item 0
182 print ("Process", depth
, token
.indent
, token
,)
183 if token
.at_line_start
:
184 print ("at_line_start",)
185 if token
.must_indent
:
186 print ("must_indent",)
189 # WS only occurs at the start of the line
190 # There may be WS followed by NEWLINE so
191 # only track the depth here. Don't indent/dedent
192 # until there's something real.
193 if token
.type == "WS":
195 depth
= len(token
.value
)
197 # WS tokens are never passed to the parser
200 if token
.type == "NEWLINE":
202 if prev_was_ws
or token
.at_line_start
:
205 # pass the other cases on through
209 # then it must be a real token (not WS, not NEWLINE)
210 # which can affect the indentation level
213 if token
.must_indent
:
214 # The current depth must be larger than the previous level
215 if not (depth
> levels
[-1]):
216 raise IndentationError("expected an indented block")
219 yield INDENT(token
.lineno
)
221 elif token
.at_line_start
:
222 # Must be on the same level or one of the previous levels
223 if depth
== levels
[-1]:
226 elif depth
> levels
[-1]:
227 raise IndentationError("indent increase but not in new block")
229 # Back up; but only if it matches a previous level
231 i
= levels
.index(depth
)
233 raise IndentationError("inconsistent indentation")
234 for _
in range(i
+1, len(levels
)):
235 yield DEDENT(token
.lineno
)
240 ### Finished processing ###
242 # Must dedent any remaining levels
244 assert token
is not None
245 for _
in range(1, len(levels
)):
246 yield DEDENT(token
.lineno
)
249 # The top-level filter adds an ENDMARKER, if requested.
250 # Python's grammar uses it.
251 def filter(lexer
, add_endmarker
= True):
253 tokens
= iter(lexer
.token
, None)
254 tokens
= python_colonify(lexer
, tokens
)
255 tokens
= track_tokens_filter(lexer
, tokens
)
256 for token
in indentation_filter(tokens
):
261 if token
is not None:
262 lineno
= token
.lineno
263 yield _new_token("ENDMARKER", lineno
)
280 'NUMBER', # Python decimals
281 'BINARY', # Python binary
282 'STRING', # single quoted strings only; syntax of raw strings
322 def build(self
,**kwargs
):
323 self
.lexer
= lex
.lex(module
=self
, **kwargs
)
326 r
"""0x[0-9a-fA-F_]+"""
327 val
= t
.value
.replace("_", "")
328 t
.value
= SelectableInt(int(val
, 16), (len(val
)-2)*16)
331 def t_BINARY(self
, t
):
333 t
.value
= SelectableInt(int(t
.value
, 2), len(t
.value
)-2)
337 # taken from decmial.py but without the leading sign
338 def t_NUMBER(self
, t
):
339 r
"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
340 t
.value
= int(t
.value
)
343 def t_STRING(self
, t
):
344 r
"'([^\\']+|\\'|\\\\)*'" # I think this is right ...
345 print (repr(t
.value
))
346 t
.value
=t
.value
[1:-1]
351 t_ASSIGNEA
= r
'<-iea'
373 # Ply nicely documented how to do this.
388 "default": "DEFAULT",
392 r
'[a-zA-Z_][a-zA-Z0-9_]*'
393 t
.type = self
.RESERVED
.get(t
.value
, "NAME")
396 # Putting this before t_WS let it consume lines with only comments in
397 # them so the latter code never sees the WS part. Not consuming the
398 # newline. Needed for "if 1: #comment"
399 def t_comment(self
, t
):
400 r
"[ ]*\043[^\n]*" # \043 is '#'
407 if t
.lexer
.at_line_start
and t
.lexer
.paren_count
== 0 and \
408 t
.lexer
.brack_count
== 0:
411 # Don't generate newline tokens when inside of parenthesis, eg
414 def t_newline(self
, t
):
416 t
.lexer
.lineno
+= len(t
.value
)
418 if t
.lexer
.paren_count
== 0 and t
.lexer
.brack_count
== 0:
421 def t_LBRACK(self
, t
):
423 t
.lexer
.brack_count
+= 1
426 def t_RBRACK(self
, t
):
428 # check for underflow? should be the job of the parser
429 t
.lexer
.brack_count
-= 1
434 t
.lexer
.paren_count
+= 1
439 # check for underflow? should be the job of the parser
440 t
.lexer
.paren_count
-= 1
445 def t_error(self
, t
):
446 raise SyntaxError("Unknown symbol %r" % (t
.value
[0],))
447 print ("Skipping", repr(t
.value
[0]))
451 # Combine Ply and my filters into a new lexer
453 class IndentLexer(PowerLexer
):
454 def __init__(self
, debug
=0, optimize
=0, lextab
='lextab', reflags
=0):
456 self
.build(debug
=debug
, optimize
=optimize
,
457 lextab
=lextab
, reflags
=reflags
)
458 self
.token_stream
= None
460 def input(self
, s
, add_endmarker
=True):
461 s
= annoying_case_hack_filter(s
)
465 self
.lexer
.paren_count
= 0
466 self
.lexer
.brack_count
= 0
468 self
.token_stream
= filter(self
.lexer
, add_endmarker
)
472 return next(self
.token_stream
)
473 except StopIteration:
494 if (RS)[63-n] = 0b1 then
501 if __name__
== '__main__':
508 lexer
= IndentLexer(debug
=1)
509 # Give the lexer some input
514 tokens
= iter(lexer
.token
, None)