# soc.git - src/soc/decoder/pseudo/lexer.py
# commit: add <-iea operator
# Based on GardenSnake - a parser generator demonstration program
# GardenSnake was released into the Public Domain by Andrew Dalke.

# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Modifications for inclusion in PLY distribution
from copy import copy
from ply import lex
from soc.decoder.selectable_int import SelectableInt

## I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters. One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code. The other is "at_line_start", which is True for WS
# and for the first non-WS/non-NEWLINE token on a line. It flags the
# check to see if the new line has changed indentation level.

# Python's syntax has three INDENT states
# 0) no colon hence no need to indent
# 1) "if 1: go()" - simple statements have a COLON but no need for an indent
# 2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2
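
# Illustrative mapping of pseudo-code on to these states (a sketch,
# assuming the THEN -> COLON rewrite performed by python_colonify below):
#
#   n <- 1             NO_INDENT   (no colon on the line)
#   if a then b <- 1   MAY_INDENT  (colon seen, statement follows inline)
#   if a then          MUST_INDENT (colon followed directly by newline;
#       b <- 1                      the next line must indent)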

# turn into python-like colon syntax from pseudo-code syntax
def python_colonify(lexer, tokens):

    forwhile_seen = False
    for token in tokens:
        #print ("track colon token", token, token.type)

        if token.type == 'THEN':
            # turn then into colon
            token.type = "COLON"
            yield token
        elif token.type == 'ELSE':
            yield token
            token = copy(token)
            token.type = "COLON"
            yield token
        elif token.type in ['DO', 'WHILE', 'FOR']:
            forwhile_seen = True
            yield token
        elif token.type == 'NEWLINE':
            if forwhile_seen:
                ctok = copy(token)
                ctok.type = "COLON"
                yield ctok
                forwhile_seen = False
            yield token
        else:
            yield token

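# Illustrative effect of the rewrite above, as token types (a sketch):
#
#   "x then y"         ... THEN ...        ->  ... COLON ...
#   "else"             ELSE NEWLINE        ->  ELSE COLON NEWLINE
#   "while x" + "\n"   WHILE NAME NEWLINE  ->  WHILE NAME COLON NEWLINE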

# only care about whitespace at the start of a line
def track_tokens_filter(lexer, tokens):
    oldignore = lexer.lexignore
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    saw_colon = False
    for token in tokens:
        #print ("track token", token, token.type)
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start == True
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        # really bad hack that changes the lexer's ignore state.
        # when "must indent" is seen (basically "real tokens" seen)
        # then ignore whitespace.
        if token.must_indent:
            lexer.lexignore = ('ignore', ' ')
        else:
            lexer.lexignore = oldignore

        token.indent = indent
        yield token
        lexer.at_line_start = at_line_start

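# Example of the tagging above (a sketch): in "if a then\n    b <- 1",
# the COLON (rewritten from "then") sets MAY_INDENT, the NEWLINE promotes
# it to MUST_INDENT, so the NAME token "b" arrives at the next filter
# with at_line_start=True and must_indent=True.
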
def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    tok.lexpos = -1
    return tok

# Synthesize a DEDENT tag
def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag
def INDENT(lineno):
    return _new_token("INDENT", lineno)

# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        if 0:
            print ("Process", depth, token.indent, token,)
            if token.at_line_start:
                print ("at_line_start",)
            if token.must_indent:
                print ("must_indent",)
            print ()

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError("indent increase but not in a new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i+1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)

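# Worked example of the filter above: given the (already colonified)
# pseudo-code
#
#   while a :
#       b <- 1
#   c <- 2
#
# the NAME "b" carries must_indent, so depth 4 is pushed onto levels and
# an INDENT is emitted before it; "c" is back at depth 0, which matches
# levels[0], so one matching DEDENT is emitted before it.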

# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = python_colonify(lexer, tokens)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

##### Lexer ######

class PowerLexer:
    tokens = (
        'DEF',
        'IF',
        'THEN',
        'ELSE',
        'FOR',
        'TO',
        'DO',
        'WHILE',
        'BREAK',
        'NAME',
        'NUMBER',  # Python decimals
        'BINARY',  # Python binary
        'STRING',  # single quoted strings only; syntax of raw strings
        'LPAR',
        'RPAR',
        'LBRACK',
        'RBRACK',
        'COLON',
        'EQ',
        'ASSIGNEA',
        'ASSIGN',
        'LTU',
        'GTU',
        'LE',
        'GE',
        'LT',
        'GT',
        'PLUS',
        'MINUS',
        'MULT',
        'DIV',
        'MOD',
        'INVERT',
        'APPEND',
        'BITOR',
        'BITAND',
        'BITXOR',
        'RETURN',
        'WS',
        'NEWLINE',
        'COMMA',
        'SEMICOLON',
        'INDENT',
        'DEDENT',
        'ENDMARKER',
    )

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def t_BINARY(self, t):
        r"""0b[01]+"""
        t.value = SelectableInt(int(t.value, 2), len(t.value)-2)
        return t
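
    # e.g. t_BINARY turns "0b101" into SelectableInt(5, 3): the bit-width
    # is the number of binary digits after the "0b" prefix.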

    #t_NUMBER = r'\d+'
    # taken from decimal.py but without the leading sign
    def t_NUMBER(self, t):
        r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
        # note: the pattern matches decimal floats as well, but the
        # conversion below only handles integers
        t.value = int(t.value)
        return t

    def t_STRING(self, t):
        r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
        print (repr(t.value))
        t.value = t.value[1:-1]
        return t

    t_COLON = r':'
    t_EQ = r'='
    t_ASSIGNEA = r'<-iea'
    t_ASSIGN = r'<-'
    t_LTU = r'<u'
    t_GTU = r'>u'
    t_LE = r'<='
    t_GE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_MULT = r'\*'
    t_DIV = r'/'
    t_MOD = r'%'
    t_INVERT = r'¬'
    t_COMMA = r','
    t_SEMICOLON = r';'
    t_APPEND = r'\|\|'
    t_BITOR = r'\|'
    t_BITAND = r'\&'
    t_BITXOR = r'\^'

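    # Illustrative token stream produced by these rules (a sketch):
    #
    #   RA <-iea CIA + 4
    #
    # lexes as NAME ASSIGNEA NAME PLUS NUMBER NEWLINE; the longer
    # pattern "<-iea" wins over "<-" because PLY tries string-defined
    # rules in order of decreasing regex length.
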
    # Ply nicely documented how to do this.

    RESERVED = {
        "def": "DEF",
        "if": "IF",
        "then": "THEN",
        "else": "ELSE",
        "leave": "BREAK",
        "for": "FOR",
        "to": "TO",
        "while": "WHILE",
        "do": "DO",
        "return": "RETURN",
    }

    def t_NAME(self, t):
        r'[a-zA-Z_][a-zA-Z0-9_]*'
        t.type = self.RESERVED.get(t.value, "NAME")
        return t

    # Putting this before t_WS lets it consume lines with only comments
    # in them, so the later code never sees the WS part.  It does not
    # consume the newline, which is needed for "if 1: #comment"
    def t_comment(self, t):
        r"[ ]*\043[^\n]*"  # \043 is '#'
        pass

    # Whitespace
    def t_WS(self, t):
        r'[ ]+'
        if t.lexer.at_line_start and t.lexer.paren_count == 0 and \
                t.lexer.brack_count == 0:
            return t

    # Don't generate newline tokens when inside of parenthesis, eg
    #    a = (1,
    #         2, 3)
    def t_newline(self, t):
        r'\n+'
        t.lexer.lineno += len(t.value)
        t.type = "NEWLINE"
        if t.lexer.paren_count == 0 and t.lexer.brack_count == 0:
            return t

    def t_LBRACK(self, t):
        r'\['
        t.lexer.brack_count += 1
        return t

    def t_RBRACK(self, t):
        r'\]'
        # check for underflow? should be the job of the parser
        t.lexer.brack_count -= 1
        return t

    def t_LPAR(self, t):
        r'\('
        t.lexer.paren_count += 1
        return t

    def t_RPAR(self, t):
        r'\)'
        # check for underflow? should be the job of the parser
        t.lexer.paren_count -= 1
        return t

    #t_ignore = " "

    def t_error(self, t):
        # no recovery attempted: report the unknown symbol and stop
        raise SyntaxError("Unknown symbol %r" % (t.value[0],))

# Combine Ply and my filters into a new lexer

class IndentLexer(PowerLexer):
    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.build(debug=debug, optimize=optimize,
                   lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.brack_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return next(self.token_stream)
        except StopIteration:
            return None

if __name__ == '__main__':

    # quick test/demo
    cnttzd = """
n <- 0
do while n < 64
    if (RS)[63-n] = 0b1 then
        leave
    n <- n + 1
RA <- EXTZ64(n)
print (RA)
"""

    code = cnttzd

    lexer = IndentLexer(debug=1)
    # Give the lexer some input
    print ("code")
    print (code)
    lexer.input(code)

431