util/stats/dbinit.py

   1 # Copyright (c) 2003-2004 The Regents of The University of Michigan
   2 # All rights reserved.
   3 #
   4 # Redistribution and use in source and binary forms, with or without
   5 # modification, are permitted provided that the following conditions are
   6 # met: redistributions of source code must retain the above copyright
   7 # notice, this list of conditions and the following disclaimer;
   8 # redistributions in binary form must reproduce the above copyright
   9 # notice, this list of conditions and the following disclaimer in the
  10 # documentation and/or other materials provided with the distribution;
  11 # neither the name of the copyright holders nor the names of its
  12 # contributors may be used to endorse or promote products derived from
  13 # this software without specific prior written permission.
  14 #
  15 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26
  27 import MySQLdb
  28
  29 class MyDB(object):
  30     def __init__(self, options):
  31         self.name = options.db
  32         self.host = options.host
  33         self.user = options.user
  34         self.passwd = options.passwd
  35         self.mydb = None
  36         self.cursor = None
  37
  38     def admin(self):
  39         self.close()
  40         self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
  41                                     passwd=self.passwd)
  42         self.cursor = self.mydb.cursor()
  43
  44     def connect(self):
  45         self.close()
  46         self.mydb = MySQLdb.connect(db=self.name, host=self.host,
  47                                     user=self.user, passwd=self.passwd)
  48         self.cursor = self.mydb.cursor()
  49
  50     def close(self):
  51         if self.mydb is not None:
  52             self.mydb.close()
  53         self.cursor = None
  54
  55     def query(self, sql):
  56         self.cursor.execute(sql)
  57
  58     def drop(self):
  59         self.query('DROP DATABASE IF EXISTS %s' % self.name)
  60
  61     def create(self):
  62         self.query('CREATE DATABASE %s' % self.name)
  63
  64     def populate(self):
  65         #
  66         # Each run (or simulation) gets its own entry in the runs table to
  67         # group stats by where they were generated
  68         #
  69         # COLUMNS:
  70         #   'id' is a unique identifier for each run to be used in other
  71         #       tables.
  72         #   'name' is the user designated name for the data generated.  It is
  73         #       configured in the simulator.
  74         #   'user' identifies the user that generated the data for the given
  75         #       run.
  76         #   'project' another name to identify runs for a specific goal
  77         #   'date' is a timestamp for when the data was generated.  It can be
  78         #       used to easily expire data that was generated in the past.
  79         #   'expire' is a timestamp for when the data should be removed from
  80         #       the database so we don't have years worth of junk.
  81         #
  82         # INDEXES:
  83         #   'run' is indexed so you can find out details of a run if the run
  84         #       was retreived from the data table.
  85         #   'name' is indexed so that two all run names are forced to be unique
  86         #
  87         self.query('''
  88         CREATE TABLE runs(
  89             rn_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
  90             rn_name     VARCHAR(200)            NOT NULL,
  91             rn_sample   VARCHAR(32)             NOT NULL,
  92             rn_user     VARCHAR(32)             NOT NULL,
  93             rn_project  VARCHAR(100)            NOT NULL,
  94             rn_date     TIMESTAMP               NOT NULL,
  95             rn_expire   TIMESTAMP               NOT NULL,
  96             PRIMARY KEY (rn_id),
  97             UNIQUE (rn_name,rn_sample)
  98         ) TYPE=InnoDB''')
  99
 100         #
 101         # We keep the bin names separate so that the data table doesn't get
 102         # huge since bin names are frequently repeated.
 103         #
 104         # COLUMNS:
 105         #   'id' is the unique bin identifer.
 106         #   'name' is the string name for the bin.
 107         #
 108         # INDEXES:
 109         #   'bin' is indexed to get the name of a bin when data is retrieved
 110         #       via the data table.
 111         #   'name' is indexed to get the bin id for a named bin when you want
 112         #       to search the data table based on a specific bin.
 113         #
 114         self.query('''
 115         CREATE TABLE bins(
 116             bn_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 117             bn_name     VARCHAR(255)            NOT NULL,
 118             PRIMARY KEY(bn_id),
 119             UNIQUE (bn_name)
 120         ) TYPE=InnoDB''')
 121
 122         #
 123         # The stat table gives us all of the data for a particular stat.
 124         #
 125         # COLUMNS:
 126         #   'stat' is a unique identifier for each stat to be used in other
 127         #       tables for references.
 128         #   'name' is simply the simulator derived name for a given
 129         #       statistic.
 130         #   'descr' is the description of the statistic and what it tells
 131         #       you.
 132         #   'type' defines what the stat tells you.  Types are:
 133         #       SCALAR: A simple scalar statistic that holds one value
 134         #       VECTOR: An array of statistic values.  Such a something that
 135         #           is generated per-thread.  Vectors exist to give averages,
 136         #            pdfs, cdfs, means, standard deviations, etc across the
 137         #           stat values.
 138         #       DIST: Is a distribution of data.  When the statistic value is
 139         #            sampled, its value is counted in a particular bucket.
 140         #           Useful for keeping track of utilization of a resource.
 141         #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
 142         #       VECTORDIST: Can be used when the distribution needs to be
 143         #            factored out into a per-thread distribution of data for
 144         #            example.  It can still be summed across threads to find
 145         #           the total distribution.
 146         #       VECTOR2D: Can be used when you have a stat that is not only
 147         #           per-thread, but it is per-something else.  Like
 148         #           per-message type.
 149         #       FORMULA: This statistic is a formula, and its data must be
 150         #            looked up in the formula table, for indicating how to
 151         #           present its values.
 152         #   'subdata' is potentially used by any of the vector types to
 153         #       give a specific name to all of the data elements within a
 154         #       stat.
 155         #   'print' indicates whether this stat should be printed ever.
 156         #       (Unnamed stats don't usually get printed)
 157         #   'prereq' only print the stat if the prereq is not zero.
 158         #   'prec' number of decimal places to print
 159         #   'nozero' don't print zero values
 160         #   'nonan' don't print NaN values
 161         #   'total' for vector type stats, print the total.
 162         #   'pdf' for vector type stats, print the pdf.
 163         #   'cdf' for vector type stats, print the cdf.
 164         #
 165         #   The Following are for dist type stats:
 166         #   'min' is the minimum bucket value. Anything less is an underflow.
 167         #   'max' is the maximum bucket value. Anything more is an overflow.
 168         #   'bktsize' is the approximate number of entries in each bucket.
 169         #   'size' is the number of buckets. equal to (min/max)/bktsize.
 170         #
 171         # INDEXES:
 172         #   'stat' is indexed so that you can find out details about a stat
 173         #       if the stat id was retrieved from the data table.
 174         #   'name' is indexed so that you can simply look up data about a
 175         #       named stat.
 176         #
 177         self.query('''
 178         CREATE TABLE stats(
 179             st_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 180             st_name     VARCHAR(255)            NOT NULL,
 181             st_descr    TEXT                    NOT NULL,
 182             st_type     ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
 183                 "VECTOR2D", "FORMULA")  NOT NULL,
 184             st_print    BOOL                    NOT NULL,
 185             st_prereq   SMALLINT UNSIGNED       NOT NULL,
 186             st_prec     TINYINT                 NOT NULL,
 187             st_nozero   BOOL                    NOT NULL,
 188             st_nonan    BOOL                    NOT NULL,
 189             st_total    BOOL                    NOT NULL,
 190             st_pdf      BOOL                    NOT NULL,
 191             st_cdf      BOOL                    NOT NULL,
 192             st_min      DOUBLE                  NOT NULL,
 193             st_max      DOUBLE                  NOT NULL,
 194             st_bktsize  DOUBLE                  NOT NULL,
 195             st_size     SMALLINT UNSIGNED       NOT NULL,
 196             PRIMARY KEY (st_id),
 197             UNIQUE (st_name)
 198         ) TYPE=InnoDB''')
 199
 200         #
 201         # This is the main table of data for stats.
 202         #
 203         # COLUMNS:
 204         #   'stat' refers to the stat field given in the stat table.
 205         #
 206         #   'x' referrs to the first dimension of a multi-dimensional stat. For
 207         #       a vector, x will start at 0 and increase for each vector
 208         #       element.
 209         #       For a distribution:
 210         #       -1: sum (for calculating standard deviation)
 211         #       -2: sum of squares (for calculating standard deviation)
 212         #       -3: total number of samples taken (for calculating
 213         #           standard deviation)
 214         #       -4: minimum value
 215         #       -5: maximum value
 216         #       -6: underflow
 217         #       -7: overflow
 218         #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
 219         #       dimension.
 220         #   'run' is the run that the data was generated from.  Details up in
 221         #       the run table
 222         #   'tick' is a timestamp generated by the simulator.
 223         #   'bin' is the name of the bin that the data was generated in, if
 224         #       any.
 225         #   'data' is the actual stat value.
 226         #
 227         # INDEXES:
 228         #   'stat' is indexed so that a user can find all of the data for a
 229         #       particular stat. It is not unique, because that specific stat
 230         #       can be found in many runs, bins, and samples, in addition to
 231         #       having entries for the mulidimensional cases.
 232         #   'run' is indexed to allow a user to remove all of the data for a
 233         #       particular execution run.  It can also be used to allow the
 234         #       user to print out all of the data for a given run.
 235         #
 236         self.query('''
 237         CREATE TABLE data(
 238             dt_stat     SMALLINT UNSIGNED       NOT NULL,
 239             dt_x        SMALLINT                NOT NULL,
 240             dt_y        SMALLINT                NOT NULL,
 241             dt_run      SMALLINT UNSIGNED       NOT NULL,
 242             dt_tick     BIGINT UNSIGNED         NOT NULL,
 243             dt_bin      SMALLINT UNSIGNED       NOT NULL,
 244             dt_data     DOUBLE                  NOT NULL,
 245             INDEX (dt_stat),
 246             INDEX (dt_run),
 247             UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin)
 248         ) TYPE=InnoDB;''')
 249
 250         #
 251         # Names and descriptions for multi-dimensional stats (vectors, etc.)
 252         # are stored here instead of having their own entry in the statistics
 253         # table. This allows all parts of a single stat to easily share a
 254         # single id.
 255         #
 256         # COLUMNS:
 257         #   'stat' is the unique stat identifier from the stat table.
 258         #   'x' is the first dimension for multi-dimensional stats
 259         #       corresponding to the data table above.
 260         #   'y' is the second dimension for multi-dimensional stats
 261         #       corresponding to the data table above.
 262         #   'name' is the specific subname for the unique stat,x,y combination.
 263         #   'descr' is the specific description for the uniqe stat,x,y
 264         #        combination.
 265         #
 266         # INDEXES:
 267         #   'stat' is indexed so you can get the subdata for a specific stat.
 268         #
 269         self.query('''
 270         CREATE TABLE subdata(
 271             sd_stat     SMALLINT UNSIGNED       NOT NULL,
 272             sd_x        SMALLINT                NOT NULL,
 273             sd_y        SMALLINT                NOT NULL,
 274             sd_name     VARCHAR(255)            NOT NULL,
 275             sd_descr    TEXT,
 276             UNIQUE (sd_stat,sd_x,sd_y)
 277         ) TYPE=InnoDB''')
 278
 279
 280         #
 281         # The formula table is maintained separately from the data table
 282         # because formula data, unlike other stat data cannot be represented
 283         # there.
 284         #
 285         # COLUMNS:
 286         #   'stat' refers to the stat field generated in the stat table.
 287         #   'formula' is the actual string representation of the formula
 288         #       itself.
 289         #
 290         # INDEXES:
 291         #   'stat' is indexed so that you can just look up a formula.
 292         #
 293         self.query('''
 294         CREATE TABLE formulas(
 295             fm_stat     SMALLINT UNSIGNED       NOT NULL,
 296             fm_formula  BLOB                    NOT NULL,
 297             PRIMARY KEY(fm_stat)
 298         ) TYPE=InnoDB''')
 299
 300         #
 301         # Each stat used in each formula is kept in this table.  This way, if
 302         # you want to print out a particular formula, you can simply find out
 303         # which stats you need by looking in this table.  Additionally, when
 304         # you remove a stat from the stats table and data table, you remove
 305         # any references to the formula in this table.  When a formula is no
 306         # longer referred to, you remove its entry.
 307         #
 308         # COLUMNS:
 309         #   'stat' is the stat id from the stat table above.
 310         #   'child' is the stat id of a stat that is used for this formula.
 311         #       There may be many children for any given 'stat' (formula)
 312         #
 313         # INDEXES:
 314         #   'stat' is indexed so you can look up all of the children for a
 315         #       particular stat.
 316         #   'child' is indexed so that you can remove an entry when a stat is
 317         #       removed.
 318         #
 319         self.query('''
 320         CREATE TABLE formula_ref(
 321             fr_stat     SMALLINT UNSIGNED       NOT NULL,
 322             fr_run      SMALLINT UNSIGNED       NOT NULL,
 323             UNIQUE (fr_stat,fr_run),
 324             INDEX (fr_stat),
 325             INDEX (fr_run)
 326         ) TYPE=InnoDB''')
 327
 328         # COLUMNS:
 329         #   'event' is the unique event id from the event_desc table
 330         #   'run' is simulation run id that this event took place in
 331         #   'tick' is the tick when the event happened
 332         #
 333         # INDEXES:
 334         #   'event' is indexed so you can look up all occurences of a
 335         #       specific event
 336         #   'run' is indexed so you can find all events in a run
 337         #   'tick' is indexed because we want the unique thing anyway
 338         #   'event,run,tick' is unique combination
 339         self.query('''
 340         CREATE TABLE events(
 341             ev_event    SMALLINT UNSIGNED       NOT NULL,
 342             ev_run      SMALLINT UNSIGNED       NOT NULL,
 343             ev_tick     BIGINT   UNSIGNED       NOT NULL,
 344             INDEX(ev_event),
 345             INDEX(ev_run),
 346             INDEX(ev_tick),
 347             UNIQUE(ev_event,ev_run,ev_tick)
 348         ) TYPE=InnoDB''')
 349
 350         # COLUMNS:
 351         #   'id' is the unique description id
 352         #   'name' is the name of the event that occurred
 353         #
 354         # INDEXES:
 355         #   'id' is indexed because it is the primary key and is what you use
 356         #       to look up the descriptions
 357         #   'name' is indexed so one can find the event based on name
 358         #
 359         self.query('''
 360         CREATE TABLE event_names(
 361             en_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 362             en_name     VARCHAR(255)            NOT NULL,
 363             PRIMARY KEY (en_id),
 364             UNIQUE (en_name)
 365         ) TYPE=InnoDB''')
 366
 367     def clean(self):
 368         self.query('''
 369         DELETE data
 370         FROM data
 371         LEFT JOIN runs ON dt_run=rn_id
 372         WHERE rn_id IS NULL''')
 373
 374         self.query('''
 375         DELETE formula_ref
 376         FROM formula_ref
 377         LEFT JOIN runs ON fr_run=rn_id
 378         WHERE rn_id IS NULL''')
 379
 380         self.query('''
 381         DELETE formulas
 382         FROM formulas
 383         LEFT JOIN formula_ref ON fm_stat=fr_stat
 384         WHERE fr_stat IS NULL''')
 385
 386         self.query('''
 387         DELETE stats
 388         FROM stats
 389         LEFT JOIN data ON st_id=dt_stat
 390         WHERE dt_stat IS NULL''')
 391
 392         self.query('''
 393         DELETE subdata
 394         FROM subdata
 395         LEFT JOIN data ON sd_stat=dt_stat
 396         WHERE dt_stat IS NULL''')
 397
 398         self.query('''
 399         DELETE bins
 400         FROM bins
 401         LEFT JOIN data ON bn_id=dt_bin
 402         WHERE dt_bin IS NULL''')
 403
 404         self.query('''
 405         DELETE events
 406         FROM events
 407         LEFT JOIN runs ON ev_run=rn_id
 408         WHERE rn_id IS NULL''')
 409
 410         self.query('''
 411         DELETE event_names
 412         FROM event_names
 413         LEFT JOIN events ON en_id=ev_event
 414         WHERE ev_event IS NULL''')