# -----------------------------------------------------------------------------
#
# Author: David Beazley (http://www.dabeaz.com)
#
# This module implements an ANSI-C style lexical preprocessor for PLY.
# -----------------------------------------------------------------------------
from __future__ import generators
# -----------------------------------------------------------------------------
# Default preprocessor lexer definitions.  These tokens are enough to get
# a basic preprocessor working.  Other modules may import these if they want
# -----------------------------------------------------------------------------
tokens = (
   'CPP_ID','CPP_INTEGER', 'CPP_FLOAT', 'CPP_STRING', 'CPP_CHAR', 'CPP_WS', 'CPP_COMMENT', 'CPP_POUND','CPP_DPOUND'
)
literals = "+-*/%|&~^<>=!?()[]{}.,;:\\\'\""
# Whitespace
def t_CPP_WS(t):
    r'\s+'
    t.lexer.lineno += t.value.count("\n")
    return t
t_CPP_POUND = r'\#'
t_CPP_DPOUND = r'\#\#'
# Identifier
t_CPP_ID = r'[A-Za-z_][\w_]*'
# Integer literal
def CPP_INTEGER(t):
    r'(((((0x)|(0X))[0-9a-fA-F]+)|(\d+))([uU]|[lL]|[uU][lL]|[lL][uU])?)'
    return t
t_CPP_INTEGER = CPP_INTEGER
# Floating literal
t_CPP_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
# String literal
def t_CPP_STRING(t):
    r'\"([^\\\n]|(\\(.|\n)))*?\"'
    t.lexer.lineno += t.value.count("\n")
    return t
# Character constant 'c' or L'c'
def t_CPP_CHAR(t):
    r'(L)?\'([^\\\n]|(\\(.|\n)))*?\''
    t.lexer.lineno += t.value.count("\n")
    return t
# Comment
def t_CPP_COMMENT(t):
    r'(/\*(.|\n)*?\*/)|(//.*?\n)'
    t.lexer.lineno += t.value.count("\n")
    return t

def t_error(t):
    t.type = t.value[0]
    t.value = t.value[0]
    t.lexer.skip(1)
    return t
import re
import copy
import time
import os.path
import sys

# -----------------------------------------------------------------------------
# trigraph()
#
# Given an input string, this function replaces all trigraph sequences.
# The following mapping is used:
#
#     ??=  ->  #      ??/  ->  \      ??'  ->  ^
#     ??(  ->  [      ??)  ->  ]      ??!  ->  |
#     ??<  ->  {      ??>  ->  }      ??-  ->  ~
# -----------------------------------------------------------------------------
_trigraph_pat = re.compile(r'''\?\?[=/\'\(\)\!<>\-]''')
_trigraph_rep = {'=':'#', '/':'\\', "'":'^', '(':'[', ')':']',
                 '!':'|', '<':'{', '>':'}', '-':'~'}
def trigraph(input):
    return _trigraph_pat.sub(lambda g: _trigraph_rep[g.group()[-1]],input)
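# For example, trigraph("??=define ARR(x) x??(0??)") returns the string
# "#define ARR(x) x[0]".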
# ------------------------------------------------------------------
# Macro object
#
# This object holds information about preprocessor macros
#
#    .name      - Macro name (string)
#    .value     - Macro value (a list of tokens)
#    .arglist   - List of argument names
#    .variadic  - Boolean indicating whether or not the macro is variadic
#    .vararg    - Name of the variadic parameter
#
# When a macro is created, the macro replacement token sequence is
# pre-scanned and used to create patch lists that are later used
# during macro expansion
# ------------------------------------------------------------------
class Macro(object):
    def __init__(self,name,value,arglist=None,variadic=False):
        self.name = name
        self.value = value
        self.arglist = arglist
        self.variadic = variadic
        if variadic:
            self.vararg = arglist[-1]
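# For example (illustrative), "#define MAX(a,b) ((a)>(b)?(a):(b))" yields a
# Macro with .name == 'MAX', .arglist == ['a','b'], .variadic == False, and
# .value holding the tokens of the replacement text.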
# ------------------------------------------------------------------
# Preprocessor object
#
# Object representing a preprocessor.  Contains macro definitions,
# include directories, and other information
# ------------------------------------------------------------------
class Preprocessor(object):
    def __init__(self,lexer=None):
        if lexer is None:
            import ply.lex as lex
            lexer = lex.lexer
        self.lexer = lexer
        self.macros = { }
        self.path = []
        self.temp_path = []

        # Probe the lexer for selected tokens
        self.lexprobe()

        tm = time.localtime()
        self.define("__DATE__ \"%s\"" % time.strftime("%b %d %Y",tm))
        self.define("__TIME__ \"%s\"" % time.strftime("%H:%M:%S",tm))
    # ----------------------------------------------------------------------
    # tokenize()
    #
    # Utility function.  Given a string of text, tokenize into a list of tokens
    # ----------------------------------------------------------------------
    def tokenize(self,text):
        tokens = []
        self.lexer.input(text)
        while True:
            tok = self.lexer.token()
            if not tok: break
            tokens.append(tok)
        return tokens
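    # For example, self.tokenize("a + 1") returns the identifier, whitespace,
    # operator, and integer tokens in order, exactly as the lexer reports them.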
    # ----------------------------------------------------------------------
    # error()
    #
    # Report a preprocessor error/warning of some kind
    # ----------------------------------------------------------------------
    def error(self,file,line,msg):
        print >>sys.stderr,"%s:%d %s" % (file,line,msg)
    # ----------------------------------------------------------------------
    # lexprobe()
    #
    # This method probes the preprocessor lexer object to discover
    # the token types of symbols that are important to the preprocessor.
    # If this works right, the preprocessor will simply "work"
    # with any suitable lexer regardless of how tokens have been named.
    # ----------------------------------------------------------------------
    def lexprobe(self):

        # Determine the token type for identifiers
        self.lexer.input("identifier")
        tok = self.lexer.token()
        if not tok or tok.value != "identifier":
            print "Couldn't determine identifier type"
        else:
            self.t_ID = tok.type
        # Determine the token type for integers
        self.lexer.input("12345")
        tok = self.lexer.token()
        if not tok or int(tok.value) != 12345:
            print "Couldn't determine integer type"
        else:
            self.t_INTEGER = tok.type
            self.t_INTEGER_TYPE = type(tok.value)
        # Determine the token type for strings enclosed in double quotes
        self.lexer.input("\"filename\"")
        tok = self.lexer.token()
        if not tok or tok.value != "\"filename\"":
            print "Couldn't determine string type"
        else:
            self.t_STRING = tok.type
        # Determine the token type for whitespace--if any
        self.lexer.input(" ")
        tok = self.lexer.token()
        if not tok or tok.value != " ":
            self.t_SPACE = None
        else:
            self.t_SPACE = tok.type
        # Determine the token type for newlines
        self.lexer.input("\n")
        tok = self.lexer.token()
        if not tok or tok.value != "\n":
            self.t_NEWLINE = None
            print "Couldn't determine token for newlines"
        else:
            self.t_NEWLINE = tok.type

        self.t_WS = (self.t_SPACE, self.t_NEWLINE)
        # Check for other characters used by the preprocessor
        chars = [ '<','>','#','##','\\','(',')',',','.']
        for c in chars:
            self.lexer.input(c)
            tok = self.lexer.token()
            if not tok or tok.value != c:
                print "Unable to lex '%s' required for preprocessor" % c
    # ----------------------------------------------------------------------
    # add_path()
    #
    # Adds a search path to the preprocessor.
    # ----------------------------------------------------------------------
    def add_path(self,path):
        self.path.append(path)
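    # Typical usage (the directory shown is illustrative):
    #
    #     p = Preprocessor(lexer)
    #     p.add_path("/usr/local/include")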
    # ----------------------------------------------------------------------
    # group_lines()
    #
    # Given an input string, this function splits it into lines.  Trailing whitespace
    # is removed.  Any line ending with \ is grouped with the next line.  This
    # function forms the lowest level of the preprocessor---grouping text into
    # a line-by-line format.
    # ----------------------------------------------------------------------
    def group_lines(self,input):
        lex = self.lexer.clone()
        lines = [x.rstrip() for x in input.splitlines()]
        for i in xrange(len(lines)):
            j = i+1
            while lines[i].endswith('\\') and (j < len(lines)):
                lines[i] = lines[i][:-1]+lines[j]
                lines[j] = ""
                j += 1

        input = "\n".join(lines)
        lex.input(input)
        lex.lineno = 1

        current_line = []
        while True:
            tok = lex.token()
            if not tok:
                break
            current_line.append(tok)
            if tok.type in self.t_WS and '\n' in tok.value:
                yield current_line
                current_line = []

        if current_line:
            yield current_line
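    # For example, the physical lines "#define A \" and "1" are joined into the
    # single logical line "#define A 1" before tokens are grouped and yielded.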
    # ----------------------------------------------------------------------
    # tokenstrip()
    #
    # Remove leading/trailing whitespace tokens from a token list
    # ----------------------------------------------------------------------
    def tokenstrip(self,tokens):
        i = 0
        while i < len(tokens) and tokens[i].type in self.t_WS:
            i += 1
        del tokens[:i]
        i = len(tokens)-1
        while i >= 0 and tokens[i].type in self.t_WS:
            i -= 1
        del tokens[i+1:]
        return tokens
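    # For example, the token list of "  x + y  " is trimmed in place to the
    # tokens of "x + y"; interior whitespace is untouched.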
    # ----------------------------------------------------------------------
    # collect_args()
    #
    # Collects comma separated arguments from a list of tokens.  The arguments
    # must be enclosed in parentheses.  Returns a tuple (tokencount,args,positions)
    # where tokencount is the number of tokens consumed, args is a list of arguments,
    # and positions is a list of integers containing the starting index of each
    # argument.  Each argument is represented by a list of tokens.
    #
    # When collecting arguments, leading and trailing whitespace is removed
    # from each argument.
    #
    # This function properly handles nested parentheses and commas---these do not
    # define new arguments.
    # ----------------------------------------------------------------------
    def collect_args(self,tokenlist):
        args = []
        positions = []
        current_arg = []
        nesting = 1
        tokenlen = len(tokenlist)

        # Search for the opening '('.
        i = 0
        while (i < tokenlen) and (tokenlist[i].type in self.t_WS):
            i += 1

        if (i < tokenlen) and (tokenlist[i].value == '('):
            positions.append(i+1)
        else:
            self.error(self.source,tokenlist[0].lineno,"Missing '(' in macro arguments")
            return 0, [], []

        i += 1

        while i < tokenlen:
            t = tokenlist[i]
            if t.value == '(':
                current_arg.append(t)
                nesting += 1
            elif t.value == ')':
                nesting -= 1
                if nesting == 0:
                    if current_arg:
                        args.append(self.tokenstrip(current_arg))
                        positions.append(i)
                    return i+1,args,positions
                current_arg.append(t)
            elif t.value == ',' and nesting == 1:
                args.append(self.tokenstrip(current_arg))
                positions.append(i+1)
                current_arg = []
            else:
                current_arg.append(t)
            i += 1

        # Missing end argument
        self.error(self.source,tokenlist[-1].lineno,"Missing ')' in macro arguments")
        return 0, [], []
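    # For example, given the tokens of "(a, f(b,c), d)", this returns the number
    # of tokens consumed through the closing ')', the three argument token lists
    # ("a", "f(b,c)", "d"), and their starting indices.  The comma inside f(b,c)
    # does not split an argument because nesting > 1 at that point.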
    # ----------------------------------------------------------------------
    # macro_prescan()
    #
    # Examine the macro value (token sequence) and identify patch points.
    # This is used to speed up macro expansion later on---we'll know
    # right away where to apply patches to the value to form the expansion
    # ----------------------------------------------------------------------
    def macro_prescan(self,macro):
        macro.patch = []                 # Standard macro arguments
        macro.str_patch = []             # String conversion expansion
        macro.var_comma_patch = []       # Variadic macro comma patch
        i = 0
        while i < len(macro.value):
            if macro.value[i].type == self.t_ID and macro.value[i].value in macro.arglist:
                argnum = macro.arglist.index(macro.value[i].value)
                # Conversion of argument to a string
                if i > 0 and macro.value[i-1].value == '#':
                    macro.value[i] = copy.copy(macro.value[i])
                    macro.value[i].type = self.t_STRING
                    del macro.value[i-1]
                    macro.str_patch.append((argnum,i-1))
                    continue
                # Concatenation
                elif (i > 0 and macro.value[i-1].value == '##'):
                    macro.patch.append(('c',argnum,i-1))
                    del macro.value[i-1]
                    continue
                elif ((i+1) < len(macro.value) and macro.value[i+1].value == '##'):
                    macro.patch.append(('c',argnum,i))
                    i += 1
                    continue
                # Standard expansion
                else:
                    macro.patch.append(('e',argnum,i))
            elif macro.value[i].value == '##':
                if macro.variadic and (i > 0) and (macro.value[i-1].value == ',') and \
                        ((i+1) < len(macro.value)) and (macro.value[i+1].type == self.t_ID) and \
                        (macro.value[i+1].value == macro.vararg):
                    macro.var_comma_patch.append(i-1)
            i += 1
        macro.patch.sort(key=lambda x: x[2],reverse=True)
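    # For example, prescanning "#define STR(x) #x" records a string-conversion
    # entry in str_patch, "#define CAT(a,b) a##b" records two 'c' (concatenate,
    # unexpanded) patches, and a plain use of an argument becomes an 'e'
    # (expand) patch.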
    # ----------------------------------------------------------------------
    # macro_expand_args()
    #
    # Given a Macro and list of arguments (each a token list), this method
    # returns an expanded version of a macro.  The return value is a token
    # sequence representing the replacement macro tokens
    # ----------------------------------------------------------------------
    def macro_expand_args(self,macro,args):
        # Make a copy of the macro token sequence
        rep = [copy.copy(_x) for _x in macro.value]

        # Make string expansion patches.  These do not alter the length of the replacement sequence
        str_expansion = {}
        for argnum, i in macro.str_patch:
            if argnum not in str_expansion:
                str_expansion[argnum] = ('"%s"' % "".join([x.value for x in args[argnum]])).replace("\\","\\\\")
            rep[i] = copy.copy(rep[i])
            rep[i].value = str_expansion[argnum]

        # Make the variadic macro comma patch.  If the variadic macro argument is empty,
        # we get rid of the comma
        comma_patch = False
        if macro.variadic and not args[-1]:
            for i in macro.var_comma_patch:
                rep[i] = None
                comma_patch = True

        # Make all other patches.  The order of these matters.  It is assumed that the patch list
        # has been sorted in reverse order of patch location since replacements will cause the
        # size of the replacement sequence to expand from the patch point.
        expanded = { }
        for ptype, argnum, i in macro.patch:
            # Concatenation.  Argument is left unexpanded
            if ptype == 'c':
                rep[i:i+1] = args[argnum]
            # Normal expansion.  Argument is macro expanded first
            elif ptype == 'e':
                if argnum not in expanded:
                    expanded[argnum] = self.expand_macros(args[argnum])
                rep[i:i+1] = expanded[argnum]

        # Get rid of removed comma if necessary
        if comma_patch:
            rep = [_i for _i in rep if _i]

        return rep
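    # For example, expanding MAX(1+2,y) against the earlier MAX macro splices
    # the (macro-expanded) tokens of "1+2" and "y" into a fresh copy of the
    # replacement sequence at the recorded patch points.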
    # ----------------------------------------------------------------------
    # expand_macros()
    #
    # Given a list of tokens, this function performs macro expansion.
    # The expanded argument is a dictionary that contains macros already
    # expanded.  This is used to prevent infinite recursion.
    # ----------------------------------------------------------------------
    def expand_macros(self,tokens,expanded=None):
        if expanded is None:
            expanded = {}
        i = 0
        while i < len(tokens):
            t = tokens[i]
            if t.type == self.t_ID:
                if t.value in self.macros and t.value not in expanded:
                    # Yes, we found a macro match
                    expanded[t.value] = True

                    m = self.macros[t.value]
                    if not m.arglist:
                        # A simple macro
                        ex = self.expand_macros([copy.copy(_x) for _x in m.value],expanded)
                        for e in ex:
                            e.lineno = t.lineno
                        tokens[i:i+1] = ex
                        i += len(ex)
                    else:
                        # A macro with arguments
                        j = i + 1
                        while j < len(tokens) and tokens[j].type in self.t_WS:
                            j += 1
                        if tokens[j].value == '(':
                            tokcount,args,positions = self.collect_args(tokens[j:])
                            if not m.variadic and len(args) != len(m.arglist):
                                self.error(self.source,t.lineno,"Macro %s requires %d arguments" % (t.value,len(m.arglist)))
                                i = j + tokcount
                            elif m.variadic and len(args) < len(m.arglist)-1:
                                if len(m.arglist) > 2:
                                    self.error(self.source,t.lineno,"Macro %s must have at least %d arguments" % (t.value, len(m.arglist)-1))
                                else:
                                    self.error(self.source,t.lineno,"Macro %s must have at least %d argument" % (t.value, len(m.arglist)-1))
                                i = j + tokcount
                            else:
                                if m.variadic:
                                    if len(args) == len(m.arglist)-1:
                                        args.append([])
                                    else:
                                        args[len(m.arglist)-1] = tokens[j+positions[len(m.arglist)-1]:j+tokcount-1]
                                        del args[len(m.arglist):]

                                # Get macro replacement text
                                rep = self.macro_expand_args(m,args)
                                rep = self.expand_macros(rep,expanded)
                                for r in rep:
                                    r.lineno = t.lineno
                                tokens[i:j+tokcount] = rep
                                i += len(rep)
                    del expanded[t.value]
                    continue
                elif t.value == '__LINE__':
                    t.type = self.t_INTEGER
                    t.value = self.t_INTEGER_TYPE(t.lineno)
            i += 1
        return tokens
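    # For example, with "#define TWICE(x) ((x)+(x))" defined, expanding the
    # tokens of "TWICE(3)" produces the tokens of "((3)+(3))"; the 'expanded'
    # dict prevents TWICE from recursively expanding inside its own replacement.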
    # ----------------------------------------------------------------------
    # evalexpr()
    #
    # Evaluate an expression token sequence for the purposes of evaluating
    # integral expressions.
    # ----------------------------------------------------------------------
    def evalexpr(self,tokens):
        # tokens = tokenize(line)
        # Search for defined macros
        i = 0
        while i < len(tokens):
            if tokens[i].type == self.t_ID and tokens[i].value == 'defined':
                j = i + 1
                needparen = False
                result = "0L"
                while j < len(tokens):
                    if tokens[j].type in self.t_WS:
                        j += 1
                        continue
                    elif tokens[j].type == self.t_ID:
                        if tokens[j].value in self.macros:
                            result = "1L"
                        else:
                            result = "0L"
                        if not needparen: break
                    elif tokens[j].value == '(':
                        needparen = True
                    elif tokens[j].value == ')':
                        break
                    else:
                        self.error(self.source,tokens[i].lineno,"Malformed defined()")
                    j += 1
                tokens[i].type = self.t_INTEGER
                tokens[i].value = self.t_INTEGER_TYPE(result)
                del tokens[i+1:j+1]
            i += 1
        tokens = self.expand_macros(tokens)
        for i,t in enumerate(tokens):
            if t.type == self.t_ID:
                tokens[i] = copy.copy(t)
                tokens[i].type = self.t_INTEGER
                tokens[i].value = self.t_INTEGER_TYPE("0L")
            elif t.type == self.t_INTEGER:
                tokens[i] = copy.copy(t)
                # Strip off any trailing suffixes
                tokens[i].value = str(tokens[i].value)
                while tokens[i].value[-1] not in "0123456789abcdefABCDEF":
                    tokens[i].value = tokens[i].value[:-1]

        expr = "".join([str(x.value) for x in tokens])
        expr = expr.replace("&&"," and ")
        expr = expr.replace("||"," or ")
        expr = expr.replace("!"," not ")
        expr = expr.replace(" not ="," !=")   # undo the damage done to '!=' by the previous line
        try:
            result = eval(expr)
        except StandardError:
            self.error(self.source,tokens[0].lineno,"Couldn't evaluate expression")
            result = 0
        return result
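    # For example, with FOO and VERSION undefined (illustrative names), the
    # tokens of "defined(FOO) && VERSION > 2" become roughly the Python
    # expression "0L and 0L > 2", which eval() reduces to a false result.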
    # ----------------------------------------------------------------------
    # parsegen()
    #
    # Parse an input string.
    # ----------------------------------------------------------------------
    def parsegen(self,input,source=None):

        # Replace trigraph sequences
        t = trigraph(input)
        lines = self.group_lines(t)

        if not source:
            source = ""

        self.define("__FILE__ \"%s\"" % source)

        self.source = source
        chunk = []
        enable = True
        iftrigger = False
        ifstack = []

        for x in lines:
            for i,tok in enumerate(x):
                if tok.type not in self.t_WS: break
            if tok.value == '#':
                # Preprocessor directive

                # insert necessary whitespace instead of eaten tokens
                for tok in x:
                    if tok.type in self.t_WS and '\n' in tok.value:
                        chunk.append(tok)

                dirtokens = self.tokenstrip(x[i+1:])
                if dirtokens:
                    name = dirtokens[0].value
                    args = self.tokenstrip(dirtokens[1:])
                else:
                    name = ""
                    args = []

                if name == 'define':
                    if enable:
                        for tok in self.expand_macros(chunk):
                            yield tok
                        chunk = []
                        self.define(args)
                elif name == 'include':
                    if enable:
                        for tok in self.expand_macros(chunk):
                            yield tok
                        chunk = []
                        oldfile = self.macros['__FILE__']
                        for tok in self.include(args):
                            yield tok
                        self.macros['__FILE__'] = oldfile
                        self.source = source
                elif name == 'undef':
                    if enable:
                        for tok in self.expand_macros(chunk):
                            yield tok
                        chunk = []
                        self.undef(args)
                elif name == 'ifdef':
                    ifstack.append((enable,iftrigger))
                    if enable:
                        if not args[0].value in self.macros:
                            enable = False
                            iftrigger = False
                        else:
                            iftrigger = True
                elif name == 'ifndef':
                    ifstack.append((enable,iftrigger))
                    if enable:
                        if args[0].value in self.macros:
                            enable = False
                            iftrigger = False
                        else:
                            iftrigger = True
                elif name == 'if':
                    ifstack.append((enable,iftrigger))
                    if enable:
                        result = self.evalexpr(args)
                        if not result:
                            enable = False
                            iftrigger = False
                        else:
                            iftrigger = True
                elif name == 'elif':
                    if ifstack:
                        if ifstack[-1][0]:        # We only pay attention if outer "if" allows this
                            if enable:            # If already true, we flip enable False
                                enable = False
                            elif not iftrigger:   # If False, but not triggered yet, we'll check expression
                                result = self.evalexpr(args)
                                if result:
                                    enable = True
                                    iftrigger = True
                    else:
                        self.error(self.source,dirtokens[0].lineno,"Misplaced #elif")
                elif name == 'else':
                    if ifstack:
                        if ifstack[-1][0]:
                            if enable:
                                enable = False
                            elif not iftrigger:
                                enable = True
                                iftrigger = True
                    else:
                        self.error(self.source,dirtokens[0].lineno,"Misplaced #else")
                elif name == 'endif':
                    if ifstack:
                        enable,iftrigger = ifstack.pop()
                    else:
                        self.error(self.source,dirtokens[0].lineno,"Misplaced #endif")
                else:
                    # Unknown preprocessor directive
                    pass
            else:
                # Normal text
                if enable:
                    chunk.extend(x)

        for tok in self.expand_macros(chunk):
            yield tok
        chunk = []
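    # The (enable,iftrigger) pair drives conditionals.  For example, in
    #     #if 0 ... #elif 1 ... #else ... #endif
    # enable is False in the first branch, flips to True at the #elif (which
    # also sets iftrigger), and is False again in the #else because iftrigger
    # records that an earlier branch was taken.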
    # ----------------------------------------------------------------------
    # include()
    #
    # Implementation of file-inclusion
    # ----------------------------------------------------------------------
    def include(self,tokens):
        # Try to extract the filename and then process an include file
        if not tokens:
            return
        if tokens[0].value != '<' and tokens[0].type != self.t_STRING:
            tokens = self.expand_macros(tokens)

        if tokens[0].value == '<':
            # Include <...>
            i = 1
            while i < len(tokens):
                if tokens[i].value == '>':
                    break
                i += 1
            else:
                print "Malformed #include <...>"
                return
            filename = "".join([x.value for x in tokens[1:i]])
            path = self.path + [""] + self.temp_path
        elif tokens[0].type == self.t_STRING:
            filename = tokens[0].value[1:-1]
            path = self.temp_path + [""] + self.path
        else:
            print "Malformed #include statement"
            return

        for p in path:
            iname = os.path.join(p,filename)
            try:
                data = open(iname,"r").read()
                dname = os.path.dirname(iname)
                if dname:
                    self.temp_path.insert(0,dname)
                for tok in self.parsegen(data,filename):
                    yield tok
                if dname:
                    del self.temp_path[0]
                break
            except IOError:
                pass
        else:
            print "Couldn't find '%s'" % filename
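    # Note the search order: #include "config.h" tries self.temp_path (seeded
    # with the including file's directory) before self.path, while
    # #include <stdio.h> tries the add_path() directories first.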
    # ----------------------------------------------------------------------
    # define()
    #
    # Define a new macro
    # ----------------------------------------------------------------------
    def define(self,tokens):
        if isinstance(tokens,(str,unicode)):
            tokens = self.tokenize(tokens)

        linetok = tokens
        try:
            name = linetok[0]
            if len(linetok) > 1:
                mtype = linetok[1]
            else:
                mtype = None
            if not mtype:
                m = Macro(name.value,[])
                self.macros[name.value] = m
            elif mtype.type in self.t_WS:
                # A normal macro
                m = Macro(name.value,self.tokenstrip(linetok[2:]))
                self.macros[name.value] = m
            elif mtype.value == '(':
                # A macro with arguments
                tokcount, args, positions = self.collect_args(linetok[1:])
                variadic = False
                for a in args:
                    if variadic:
                        print "No more arguments may follow a variadic argument"
                        break
                    astr = "".join([str(_i.value) for _i in a])
                    if astr == "...":
                        variadic = True
                        a[0].type = self.t_ID
                        a[0].value = '__VA_ARGS__'
                        del a[1:]
                        continue
                    elif astr[-3:] == "..." and a[0].type == self.t_ID:
                        variadic = True
                        del a[1:]
                        # If, for some reason, "." is part of the identifier, strip off the name for the purposes
                        # of macro expansion
                        if a[0].value[-3:] == '...':
                            a[0].value = a[0].value[:-3]
                    else:
                        if len(a) > 1 or a[0].type != self.t_ID:
                            print "Invalid macro argument"
                            break
                else:
                    mvalue = self.tokenstrip(linetok[1+tokcount:])
                    # Strip whitespace from around any '##' so token pasting sees
                    # adjacent tokens
                    i = 0
                    while i < len(mvalue):
                        if i+1 < len(mvalue):
                            if mvalue[i].type in self.t_WS and mvalue[i+1].value == '##':
                                del mvalue[i]
                                continue
                            elif mvalue[i].value == '##' and mvalue[i+1].type in self.t_WS:
                                del mvalue[i+1]
                                continue
                        i += 1
                    m = Macro(name.value,mvalue,[x[0].value for x in args],variadic)
                    self.macro_prescan(m)
                    self.macros[name.value] = m
            else:
                print "Bad macro definition"
        except LookupError:
            print "Bad macro definition"
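    # Examples of accepted definitions (illustrative):
    #
    #     p.define("PI 3.14159")                               # simple macro
    #     p.define("MIN(a,b) ((a)<(b)?(a):(b))")               # macro with arguments
    #     p.define("LOG(fmt,...) printf(fmt,__VA_ARGS__)")     # variadic macro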
    # ----------------------------------------------------------------------
    # undef()
    #
    # Undefine a macro
    # ----------------------------------------------------------------------
    def undef(self,tokens):
        id = tokens[0].value
        try:
            del self.macros[id]
        except LookupError:
            pass
    # ----------------------------------------------------------------------
    # parse()
    #
    # Parse input text.
    # ----------------------------------------------------------------------
    def parse(self,input,source=None,ignore={}):
        self.ignore = ignore
        self.parser = self.parsegen(input,source)
    # ----------------------------------------------------------------------
    # token()
    #
    # Method to return individual tokens
    # ----------------------------------------------------------------------
    def token(self):
        try:
            while True:
                tok = self.parser.next()
                if tok.type not in self.ignore: return tok
        except StopIteration:
            self.parser = None
            return None
if __name__ == '__main__':
    import ply.lex as lex
    lexer = lex.lex()

    # Run a preprocessor
    f = open(sys.argv[1])
    input = f.read()
    f.close()

    p = Preprocessor(lexer)
    p.parse(input,sys.argv[1])
    while True:
        tok = p.token()
        if not tok: break
        print p.source, tok
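# Running this module as a script preprocesses the named file and prints each
# resulting token, e.g.:  python cpp.py input.c  (assuming this module is
# saved as cpp.py).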