util/find_copyrights.py

   1 #!/usr/bin/env python3
   2
   3 import os
   4 import re
   5 import sys
   6
   7 from file_types import lang_type, find_files
   8
   9 mode_line = re.compile('(-\*- *mode:.* *-\*-)')
  10 shell_comment = re.compile(r'^\s*#')
  11 lisp_comment = re.compile(r';')
  12 cpp_comment = re.compile(r'//')
  13 c_comment_start = re.compile(r'/\*')
  14 c_comment_end   = re.compile(r'\*/')
  15 def find_copyright_block(lines, lang_type):
  16     start = None
  17     if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
  18         for i,line in enumerate(lines):
  19             if i == 0 and (line.startswith('#!') or mode_line.search(line)):
  20                 continue
  21
  22             if shell_comment.search(line):
  23                 if start is None:
  24                     start = i
  25             elif start is None:
  26                 if line.strip():
  27                     return
  28             else:
  29                 yield start, i-1
  30                 start = None
  31
  32     elif lang_type in ('lisp', ):
  33         for i,line in enumerate(lines):
  34             if i == 0 and mode_line.search(line):
  35                 continue
  36
  37             if lisp_comment.search(line):
  38                 if start is None:
  39                     start = i
  40             elif start is None:
  41                 if line.strip():
  42                     return
  43             else:
  44                 yield start, i-1
  45                 start = None
  46
  47     elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
  48                        'lex', 'yacc'):
  49         mode = None
  50         for i,line in enumerate(lines):
  51             if i == 0 and mode_line.search(line):
  52                 continue
  53
  54             if mode == 'C':
  55                 assert start is not None, 'on line %d' % (i + 1)
  56                 match = c_comment_end.search(line)
  57                 if match:
  58                     yield start, i
  59                     mode = None
  60                 continue
  61
  62             cpp_match = cpp_comment.search(line)
  63             c_match = c_comment_start.search(line)
  64
  65             if cpp_match:
  66                 assert not c_match, 'on line %d' % (i + 1)
  67                 if line[:cpp_match.start()].strip():
  68                     return
  69                 if mode is None:
  70                     mode = 'CPP'
  71                     start = i
  72                 else:
  73                     text = line[cpp_match.end():].lstrip()
  74                     if text.startswith("Copyright") > 0:
  75                         yield start, i-1
  76                         start = i
  77                 continue
  78             elif mode == 'CPP':
  79                 assert start is not None, 'on line %d' % (i + 1)
  80                 if not line.strip():
  81                     continue
  82                 yield start, i-1
  83                 mode = None
  84                 if not c_match:
  85                     return
  86
  87             if c_match:
  88                 assert mode is None, 'on line %d' % (i + 1)
  89                 mode = 'C'
  90                 start = i
  91
  92             if mode is None and line.strip():
  93                 return
  94
  95     else:
  96         raise AttributeError("Could not handle language %s" % lang_type)
  97
  98 date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')
  99 def process_dates(dates):
 100     dates = [ d.strip() for d in dates.split(',') ]
 101
 102     output = set()
 103     for date in dates:
 104         match = date_range_re.match(date)
 105         if match:
 106             f,l = [ int(d) for d in match.groups() ]
 107             for i in range(f, l+1):
 108                 output.add(i)
 109         else:
 110             try:
 111                 date = int(date)
 112                 output.add(date)
 113             except ValueError:
 114                 pass
 115
 116     return output
 117
 118 copyright_re = \
 119     re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-z-,. ]+)',
 120                re.DOTALL)
 121
 122 authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-z .]+)\s*$')
 123 more_authors_re = re.compile(r'^[\s*#/]*([A-z .]+)\s*$')
 124
 125 all_owners = set()
 126 def get_data(lang_type, lines):
 127     data = []
 128     last = None
 129     for start,end in find_copyright_block(lines, lang_type):
 130         joined = ''.join(lines[start:end+1])
 131         match = copyright_re.search(joined)
 132         if not match:
 133             continue
 134
 135         c,dates,owner = match.groups()
 136         dates = dates.strip()
 137         owner = owner.strip()
 138
 139         all_owners.add(owner)
 140         try:
 141             dates = process_dates(dates)
 142         except Exception:
 143             print(dates)
 144             print(owner)
 145             raise
 146
 147         authors = []
 148         for i in range(start,end+1):
 149             line = lines[i]
 150             if not authors:
 151                 match = authors_re.search(line)
 152                 if match:
 153                     authors.append(match.group(1).strip())
 154             else:
 155                 match = more_authors_re.search(line)
 156                 if not match:
 157                     for j in range(i, end+1):
 158                         line = lines[j].strip()
 159                         if not line:
 160                             end = j
 161                             break
 162                         if line.startswith('//'):
 163                             line = line[2:].lstrip()
 164                             if line:
 165                                 end = j - 1
 166                                 break
 167                     break
 168                 authors.append(match.group(1).strip())
 169
 170         info = (owner, dates, authors, start, end)
 171         data.append(info)
 172
 173     return data
 174
 175 def datestr(dates):
 176     dates = list(dates)
 177     dates.sort()
 178
 179     output = []
 180     def add_output(first, second):
 181         if first == second:
 182             output.append('%d' % (first))
 183         else:
 184             output.append('%d-%d' % (first, second))
 185
 186     first = dates.pop(0)
 187     second = first
 188     while dates:
 189         next = dates.pop(0)
 190         if next == second + 1:
 191             second = next
 192         else:
 193             add_output(first, second)
 194             first = next
 195             second = next
 196
 197     add_output(first, second)
 198
 199     return ','.join(output)
 200
 201 usage_str = """usage:
 202 %s [-v] <directory>"""
 203
 204 def usage(exitcode):
 205     print(usage_str % sys.argv[0])
 206     if exitcode is not None:
 207         sys.exit(exitcode)
 208
 209 if __name__ == '__main__':
 210     import getopt
 211
 212     show_counts = False
 213     ignore = set()
 214     verbose = False
 215     try:
 216         opts, args = getopt.getopt(sys.argv[1:], "ci:v")
 217     except getopt.GetoptError:
 218         usage(1)
 219
 220     for o,a in opts:
 221         if o == '-c':
 222             show_counts = True
 223         if o == '-i':
 224             ignore.add(a)
 225         if o == '-v':
 226             verbose = True
 227
 228     files = []
 229
 230     for base in args:
 231         if os.path.isfile(base):
 232             files += [ (base, lang_type(base)) ]
 233         elif os.path.isdir(base):
 234             files += find_files(base)
 235         else:
 236             raise AttributeError("can't access '%s'" %  base)
 237
 238     copyrights = {}
 239     counts = {}
 240
 241     for filename, lang in files:
 242         f = file(filename, 'r')
 243         lines = f.readlines()
 244         if not lines:
 245             continue
 246
 247         lines = [ line.rstrip('\r\n') for line in lines ]
 248
 249         lt = lang_type(filename, lines[0])
 250         try:
 251             data = get_data(lt, lines)
 252         except Exception as e:
 253             if verbose:
 254                 if len(e.args) == 1:
 255                     e.args = ('%s (%s))' % (e, filename), )
 256                 print("could not parse %s: %s" % (filename, e))
 257             continue
 258
 259         for owner, dates, authors, start, end in data:
 260             if owner not in copyrights:
 261                 copyrights[owner] = set()
 262             if owner not in counts:
 263                 counts[owner] = 0
 264
 265             copyrights[owner] |= dates
 266             counts[owner] += 1
 267
 268     info = [ (counts[o], d, o) for o,d in list(copyrights.items()) ]
 269
 270     for count,dates,owner in sorted(info, reverse=True):
 271         if show_counts:
 272             owner = '%s (%s files)' % (owner, count)
 273         print('Copyright (c) %s %s' % (datestr(dates), owner))