# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line. It flags where to check
# whether the new line has changed the indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# convert pseudo-code syntax into python-like colon syntax.
# identify tokens which tell us whether a "hidden colon" is needed.
# this in turn means that track_tokens_filter "works" without needing
# complex grammar rules
def python_colonify(lexer, tokens):

    implied_colon_needed = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR', 'SWITCH']:
            implied_colon_needed = True
            yield token
        elif token.type == 'NEWLINE':
            if implied_colon_needed:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                implied_colon_needed = False
            yield token
        else:
            yield token

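# A sketch of the transformation (illustrative, not executed): for the
# pseudo-code line "if a = 1 then", the raw token stream
#
#     IF NAME EQ NUMBER THEN NEWLINE
#
# comes out of python_colonify as
#
#     IF NAME EQ NUMBER COLON NEWLINE
#
# and "do while n < 64" (DO/WHILE set implied_colon_needed) gains a
# synthesised COLON just before its NEWLINE:
#
#     DO WHILE NAME LT NUMBER COLON NEWLINE
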
# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes ignore lexer state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

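# Sketch of the tagging (illustrative, not executed): given
# "if 1 then\n    go()", the THEN has already become COLON, which sets
# indent to MAY_INDENT; the NEWLINE upgrades it to MUST_INDENT, so the
# first real token of the next line ("go") is tagged must_indent=True,
# which makes indentation_filter (below) emit an INDENT in front of it.
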
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)

def count_spaces(l):
    for i in range(len(l)):
        if l[i] != ' ':
            return i
    return 0

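# For example (a sketch): count_spaces("    x <- 3") returns 4, and an
# empty or all-space line returns 0.
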
def annoying_case_hack_filter(code):
    """add annoying "silent keyword" (fallthrough)

    this trick makes the parser take the (silent) case statement
    as a "small expression". it can then be spotted and used to indicate
    "fall through" to the next case (in the parser)

    bugs: any function that starts with the letters "case" or "default"
    will be detected erroneously. fixing that involves writing a token
    lexer which spots the fact that "case" and "default" are words,
    separating them from space, colon, bracket etc.

    http://bugs.libre-riscv.org/show_bug.cgi?id=280
    """
    res = []
    prev_spc_count = None
    for l in code.split("\n"):
        spc_count = count_spaces(l)
        nwhite = l[spc_count:]
        if nwhite.startswith("case") or nwhite.startswith("default"):
            #print ("case/default", nwhite, spc_count, prev_spc_count)
            if (prev_spc_count is not None and
                    prev_spc_count == spc_count and
                    (res[-1].endswith(":") or
                     res[-1].endswith(": fallthrough"))):
                res[-1] += " fallthrough"  # add to previous line
            prev_spc_count = spc_count
        else:
            #print ("notstarts", spc_count, nwhite)
            prev_spc_count = None
        res.append(l)
    return '\n'.join(res)

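# Worked example (a sketch, not executed): given the fragment
#
#     case(2):
#     case(4):
#         x <- 3
#
# the filter folds the empty case onto the previous line, so the parser
# sees a (silent) body for case(2):
#
#     case(2): fallthrough
#     case(4):
#         x <- 3
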
# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here. Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

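# Sketch of the resulting stream (illustrative, not executed): after
# python_colonify and track_tokens_filter, the pseudo-code
#
#     if a = 1 then
#         b <- 2
#     c <- 3
#
# filters down to roughly
#
#     IF NAME EQ NUMBER COLON NEWLINE
#     INDENT NAME ASSIGN NUMBER NEWLINE
#     DEDENT NAME ASSIGN NUMBER NEWLINE
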

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

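# The full pipeline is therefore: raw PLY tokens -> python_colonify ->
# track_tokens_filter -> indentation_filter, plus an optional trailing
# ENDMARKER.
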
##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'HEX',      # hex numbers
        'NUMBER',   # Python decimals
        'BINARY',   # Python binary
        'STRING',   # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'NE',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'SWITCH',
        'CASE',
        'DEFAULT',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_HEX(self, t):
        r"""0x[0-9a-fA-F_]+"""
        val = t.value.replace("_", "")
        # each hex digit is 4 bits wide
        t.value = SelectableInt(int(val, 16), (len(val)-2)*4)
        return t

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t

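    # Width examples (a sketch, not executed): "0xFF" has two hex
    # digits, so it lexes to SelectableInt(255, 8); "0b101" has three
    # binary digits, so it lexes to SelectableInt(5, 3).
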
    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_NE = r'!='
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
        "switch": "SWITCH",
        "case": "CASE",
        "default": "DEFAULT",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments
    # in them, so the latter code never sees the WS part. It does not
    # consume the newline, which is needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass


    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow? should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow? should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))
        # unreachable - the raise above makes lexing errors fatal:
        #print ("Skipping", repr(t.value[0]))
        #t.lexer.skip(1)


# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.debug = debug
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        s = annoying_case_hack_filter(s)
        if self.debug:
            print (s)
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

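# Typical usage (a sketch; the __main__ demo below does the same thing):
#
#     lexer = IndentLexer(debug=0)
#     lexer.input("x <- 5\n")
#     while True:
#         tok = lexer.token()
#         if tok is None:
#             break
#         print (tok)
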
switchtest = """
switch (n)
    case(1): x <- 5
    case(3): x <- 2
    case(2):
    case(4):
        x <- 3
    case(9):
    default:
        x <- 9
print (5)
"""

cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

if __name__ == '__main__':

    # quick test/demo
    #code = cnttzd
    code = switchtest
    print (code)

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)

    tokens = iter(lexer.token, None)
    for token in tokens:
        print (token)