util/stats/db.py

   1 # Copyright (c) 2003-2004 The Regents of The University of Michigan
   2 # All rights reserved.
   3 #
   4 # Redistribution and use in source and binary forms, with or without
   5 # modification, are permitted provided that the following conditions are
   6 # met: redistributions of source code must retain the above copyright
   7 # notice, this list of conditions and the following disclaimer;
   8 # redistributions in binary form must reproduce the above copyright
   9 # notice, this list of conditions and the following disclaimer in the
  10 # documentation and/or other materials provided with the distribution;
  11 # neither the name of the copyright holders nor the names of its
  12 # contributors may be used to endorse or promote products derived from
  13 # this software without specific prior written permission.
  14 #
  15 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26
  27 import MySQLdb, re, string
  28
  29 def statcmp(a, b):
  30     v1 = a.split('.')
  31     v2 = b.split('.')
  32
  33     last = min(len(v1), len(v2)) - 1
  34     for i,j in zip(v1[0:last], v2[0:last]):
  35         if i != j:
  36             return cmp(i, j)
  37
  38     # Special compare for last element.
  39     if len(v1) == len(v2):
  40         return cmp(v1[last], v2[last])
  41     else:
  42         return cmp(len(v1), len(v2))
  43
  44 class RunData:
  45     def __init__(self, row):
  46         self.run = int(row[0])
  47         self.name = row[1]
  48         self.user = row[2]
  49         self.project = row[3]
  50
  51 class SubData:
  52     def __init__(self, row):
  53         self.stat = int(row[0])
  54         self.x = int(row[1])
  55         self.y = int(row[2])
  56         self.name = row[3]
  57         self.descr = row[4]
  58
  59 class Data:
  60     def __init__(self, row):
  61         if len(row) != 5:
  62             raise 'stat db error'
  63         self.stat = int(row[0])
  64         self.run = int(row[1])
  65         self.x = int(row[2])
  66         self.y = int(row[3])
  67         self.data = float(row[4])
  68
  69     def __repr__(self):
  70         return '''Data(['%d', '%d', '%d', '%d', '%f'])''' % ( self.stat,
  71             self.run, self.x, self.y, self.data)
  72
  73 class StatData(object):
  74     def __init__(self, row):
  75         self.stat = int(row[0])
  76         self.name = row[1]
  77         self.desc = row[2]
  78         self.type = row[3]
  79         self.prereq = int(row[5])
  80         self.precision = int(row[6])
  81
  82         import flags
  83         self.flags = 0
  84         if int(row[4]): self.flags |= flags.printable
  85         if int(row[7]): self.flags |= flags.nozero
  86         if int(row[8]): self.flags |= flags.nonan
  87         if int(row[9]): self.flags |= flags.total
  88         if int(row[10]): self.flags |= flags.pdf
  89         if int(row[11]): self.flags |= flags.cdf
  90
  91         if self.type == 'DIST' or self.type == 'VECTORDIST':
  92             self.min = float(row[12])
  93             self.max = float(row[13])
  94             self.bktsize = float(row[14])
  95             self.size = int(row[15])
  96
  97         if self.type == 'FORMULA':
  98             self.formula = self.db.allFormulas[self.stat]
  99
 100 class Node(object):
 101     def __init__(self, name):
 102         self.name = name
 103     def __str__(self):
 104         return self.name
 105
 106 class Result(object):
 107     def __init__(self, x, y):
 108         self.data = {}
 109         self.x = x
 110         self.y = y
 111
 112     def __contains__(self, run):
 113         return run in self.data
 114
 115     def __getitem__(self, run):
 116         if run not in self.data:
 117             self.data[run] = [ [ 0.0 ] * self.y for i in xrange(self.x) ]
 118         return self.data[run]
 119
 120 class Database(object):
 121     def __init__(self):
 122         self.host = 'zizzer.pool'
 123         self.user = ''
 124         self.passwd = ''
 125         self.db = 'm5stats'
 126         self.cursor = None
 127
 128         self.allStats = []
 129         self.allStatIds = {}
 130         self.allStatNames = {}
 131
 132         self.allSubData = {}
 133
 134         self.allRuns = []
 135         self.allRunIds = {}
 136         self.allRunNames = {}
 137
 138         self.allBins = []
 139         self.allBinIds = {}
 140         self.allBinNames = {}
 141
 142         self.allFormulas = {}
 143
 144         self.stattop = {}
 145         self.statdict = {}
 146         self.statlist = []
 147
 148         self.mode = 'sum';
 149         self.runs = None
 150         self.bins = None
 151         self.ticks = None
 152         self.method = 'sum'
 153         self._method = type(self).sum
 154
 155     def get(self, job, stat, system=None):
 156         run = self.allRunNames.get(str(job), None)
 157         if run is None:
 158             return None
 159
 160         from info import ProxyError, scalar, vector, value, values, total, len
 161         if system is None and hasattr(job, 'system'):
 162             system = job.system
 163
 164         if system is not None:
 165             stat.system = self[system]
 166         try:
 167             if scalar(stat):
 168                 return value(stat, run.run)
 169             if vector(stat):
 170                 return values(stat, run.run)
 171         except ProxyError:
 172             return None
 173
 174         return None
 175
 176     def query(self, sql):
 177         self.cursor.execute(sql)
 178
 179     def update_dict(self, dict):
 180         dict.update(self.stattop)
 181
 182     def append(self, stat):
 183         statname = re.sub(':', '__', stat.name)
 184         path = string.split(statname, '.')
 185         pathtop = path[0]
 186         fullname = ''
 187
 188         x = self
 189         while len(path) > 1:
 190             name = path.pop(0)
 191             if not x.__dict__.has_key(name):
 192                 x.__dict__[name] = Node(fullname + name)
 193             x = x.__dict__[name]
 194             fullname = '%s%s.' % (fullname, name)
 195
 196         name = path.pop(0)
 197         x.__dict__[name] = stat
 198
 199         self.stattop[pathtop] = self.__dict__[pathtop]
 200         self.statdict[statname] = stat
 201         self.statlist.append(statname)
 202
 203     def connect(self):
 204         # connect
 205         self.thedb = MySQLdb.connect(db=self.db,
 206                                      host=self.host,
 207                                      user=self.user,
 208                                      passwd=self.passwd)
 209
 210         # create a cursor
 211         self.cursor = self.thedb.cursor()
 212
 213         self.query('''select rn_id,rn_name,rn_sample,rn_user,rn_project
 214                    from runs''')
 215         for result in self.cursor.fetchall():
 216             run = RunData(result);
 217             self.allRuns.append(run)
 218             self.allRunIds[run.run] = run
 219             self.allRunNames[run.name] = run
 220
 221         self.query('select * from bins')
 222         for id,name in self.cursor.fetchall():
 223             self.allBinIds[int(id)] = name
 224             self.allBinNames[name] = int(id)
 225
 226         self.query('select sd_stat,sd_x,sd_y,sd_name,sd_descr from subdata')
 227         for result in self.cursor.fetchall():
 228             subdata = SubData(result)
 229             if self.allSubData.has_key(subdata.stat):
 230                 self.allSubData[subdata.stat].append(subdata)
 231             else:
 232                 self.allSubData[subdata.stat] = [ subdata ]
 233
 234         self.query('select * from formulas')
 235         for id,formula in self.cursor.fetchall():
 236             self.allFormulas[int(id)] = formula.tostring()
 237
 238         StatData.db = self
 239         self.query('select * from stats')
 240         import info
 241         for result in self.cursor.fetchall():
 242             stat = info.NewStat(self, StatData(result))
 243             self.append(stat)
 244             self.allStats.append(stat)
 245             self.allStatIds[stat.stat] = stat
 246             self.allStatNames[stat.name] = stat
 247
 248     # Name: listbins
 249     # Desc: Prints all bins matching regex argument, if no argument
 250     #       is given all bins are returned
 251     def listBins(self, regex='.*'):
 252         print '%-50s %-10s' % ('bin name', 'id')
 253         print '-' * 61
 254         names = self.allBinNames.keys()
 255         names.sort()
 256         for name in names:
 257             id = self.allBinNames[name]
 258             print '%-50s %-10d' % (name, id)
 259
 260     # Name: listruns
 261     # Desc: Prints all runs matching a given user, if no argument
 262     #       is given all runs are returned
 263     def listRuns(self, user=None):
 264         print '%-40s %-10s %-5s' % ('run name', 'user', 'id')
 265         print '-' * 62
 266         for run in self.allRuns:
 267             if user == None or user == run.user:
 268                 print '%-40s %-10s %-10d' % (run.name, run.user, run.run)
 269
 270     # Name: listTicks
 271     # Desc: Prints all samples for a given run
 272     def listTicks(self, runs=None):
 273         print "tick"
 274         print "----------------------------------------"
 275         sql = 'select distinct dt_tick from data where dt_stat=1180 and ('
 276         if runs != None:
 277             first = True
 278             for run in runs:
 279                if first:
 280             #       sql += ' where'
 281                    first = False
 282                else:
 283                    sql += ' or'
 284                sql += ' dt_run=%s' % run.run
 285             sql += ')'
 286         self.query(sql)
 287         for r in self.cursor.fetchall():
 288             print r[0]
 289
 290     # Name: retTicks
 291     # Desc: Prints all samples for a given run
 292     def retTicks(self, runs=None):
 293         sql = 'select distinct dt_tick from data where dt_stat=1180 and ('
 294         if runs != None:
 295             first = True
 296             for run in runs:
 297                if first:
 298                    first = False
 299                else:
 300                    sql += ' or'
 301                sql += ' dt_run=%s' % run.run
 302             sql += ')'
 303         self.query(sql)
 304         ret = []
 305         for r in self.cursor.fetchall():
 306             ret.append(r[0])
 307         return ret
 308
 309     # Name: liststats
 310     # Desc: Prints all statistics that appear in the database,
 311     #         the optional argument is a regular expression that can
 312     #         be used to prune the result set
 313     def listStats(self, regex=None):
 314         print '%-60s %-8s %-10s' % ('stat name', 'id', 'type')
 315         print '-' * 80
 316
 317         rx = None
 318         if regex != None:
 319             rx = re.compile(regex)
 320
 321         stats = [ stat.name for stat in self.allStats ]
 322         stats.sort(statcmp)
 323         for stat in stats:
 324             stat = self.allStatNames[stat]
 325             if rx == None or rx.match(stat.name):
 326                 print '%-60s %-8s %-10s' % (stat.name, stat.stat, stat.type)
 327
 328     # Name: liststats
 329     # Desc: Prints all statistics that appear in the database,
 330     #         the optional argument is a regular expression that can
 331     #         be used to prune the result set
 332     def listFormulas(self, regex=None):
 333         print '%-60s %s' % ('formula name', 'formula')
 334         print '-' * 80
 335
 336         rx = None
 337         if regex != None:
 338             rx = re.compile(regex)
 339
 340         stats = [ stat.name for stat in self.allStats ]
 341         stats.sort(statcmp)
 342         for stat in stats:
 343             stat = self.allStatNames[stat]
 344             if stat.type == 'FORMULA' and (rx == None or rx.match(stat.name)):
 345                 print '%-60s %s' % (stat.name, self.allFormulas[stat.stat])
 346
 347     def getStat(self, stats):
 348         if type(stats) is not list:
 349             stats = [ stats ]
 350
 351         ret = []
 352         for stat in stats:
 353             if type(stat) is int:
 354                 ret.append(self.allStatIds[stat])
 355
 356             if type(stat) is str:
 357                 rx = re.compile(stat)
 358                 for stat in self.allStats:
 359                     if rx.match(stat.name):
 360                         ret.append(stat)
 361         return ret
 362
 363     def getBin(self, bins):
 364         if type(bins) is not list:
 365             bins = [ bins ]
 366
 367         ret = []
 368         for bin in bins:
 369             if type(bin) is int:
 370                 ret.append(bin)
 371             elif type(bin) is str:
 372                 ret.append(self.allBinNames[bin])
 373             else:
 374                 for name,id in self.allBinNames.items():
 375                     if bin.match(name):
 376                         ret.append(id)
 377
 378         return ret
 379
 380     def getNotBin(self, bin):
 381         map = {}
 382         for bin in getBin(bin):
 383             map[bin] = 1
 384
 385         ret = []
 386         for bin in self.allBinIds.keys():
 387             if not map.has_key(bin):
 388                 ret.append(bin)
 389
 390         return ret
 391
 392     #########################################
 393     # get the data
 394     #
 395     def inner(self, op, stat, bins, ticks, group=False):
 396         sql = 'select '
 397         sql += 'dt_stat as stat, '
 398         sql += 'dt_run as run, '
 399         sql += 'dt_x as x, '
 400         sql += 'dt_y as y, '
 401         if group:
 402             sql += 'dt_tick as tick, '
 403         sql += '%s(dt_data) as data ' % op
 404         sql += 'from data '
 405         sql += 'where '
 406
 407         if isinstance(stat, list):
 408             val = ' or '.join([ 'dt_stat=%d' % s.stat for s in stat ])
 409             sql += ' (%s)' % val
 410         else:
 411             sql += ' dt_stat=%d' % stat.stat
 412
 413         if self.runs != None and len(self.runs):
 414             val = ' or '.join([ 'dt_run=%d' % r for r in self.runs ])
 415             sql += ' and (%s)' % val
 416
 417         if bins != None and len(bins):
 418             val = ' or '.join([ 'dt_bin=%d' % b for b in bins ])
 419             sql += ' and (%s)' % val
 420
 421         if ticks != None and len(ticks):
 422             val = ' or '.join([ 'dt_tick=%d' % s for s in ticks ])
 423             sql += ' and (%s)' % val
 424
 425         sql += ' group by dt_stat,dt_run,dt_x,dt_y'
 426         if group:
 427             sql += ',dt_tick'
 428         return sql
 429
 430     def outer(self, op_out, op_in, stat, bins, ticks):
 431         sql = self.inner(op_in, stat, bins, ticks, True)
 432         sql = 'select stat,run,x,y,%s(data) from (%s) as tb ' % (op_out, sql)
 433         sql += 'group by stat,run,x,y'
 434         return sql
 435
 436     # Name: sum
 437     # Desc: given a run, a stat and an array of samples and bins,
 438     #        sum all the bins and then get the standard deviation of the
 439     #        samples for non-binned runs. This will just return the average
 440     #        of samples, however a bin array still must be passed
 441     def sum(self, stat, bins, ticks):
 442         return self.inner('sum', stat, bins, ticks)
 443
 444     # Name: avg
 445     # Desc: given a run, a stat and an array of samples and bins,
 446     #        sum all the bins and then average the samples for non-binned
 447     #        runs this will just return the average of samples, however
 448     #        a bin array still must be passed
 449     def avg(self, stat, bins, ticks):
 450         return self.outer('avg', 'sum', stat, bins, ticks)
 451
 452     # Name: stdev
 453     # Desc: given a run, a stat and an array of samples and bins,
 454     #        sum all the bins and then get the standard deviation of the
 455     #        samples for non-binned runs. This will just return the average
 456     #        of samples, however a bin array still must be passed
 457     def stdev(self, stat, bins, ticks):
 458         return self.outer('stddev', 'sum', stat, bins, ticks)
 459
 460     def __setattr__(self, attr, value):
 461         super(Database, self).__setattr__(attr, value)
 462         if attr != 'method':
 463             return
 464
 465         if value == 'sum':
 466             self._method = self.sum
 467         elif value == 'avg':
 468             self._method = self.avg
 469         elif value == 'stdev':
 470             self._method = self.stdev
 471         else:
 472             raise AttributeError, "can only set get to: sum | avg | stdev"
 473
 474     def data(self, stat, bins=None, ticks=None):
 475         if bins is None:
 476             bins = self.bins
 477         if ticks is None:
 478             ticks = self.ticks
 479         sql = self._method(self, stat, bins, ticks)
 480         self.query(sql)
 481
 482         runs = {}
 483         xmax = 0
 484         ymax = 0
 485         for x in self.cursor.fetchall():
 486             data = Data(x)
 487             if not runs.has_key(data.run):
 488                 runs[data.run] = {}
 489             if not runs[data.run].has_key(data.x):
 490                 runs[data.run][data.x] = {}
 491
 492             xmax = max(xmax, data.x)
 493             ymax = max(ymax, data.y)
 494             runs[data.run][data.x][data.y] = data.data
 495
 496         results = Result(xmax + 1, ymax + 1)
 497         for run,data in runs.iteritems():
 498             result = results[run]
 499             for x,ydata in data.iteritems():
 500                 for y,data in ydata.iteritems():
 501                     result[x][y] = data
 502         return results
 503
 504     def __getitem__(self, key):
 505         return self.stattop[key]