util/stats/dbinit.py

   1 # Copyright (c) 2003-2004 The Regents of The University of Michigan
   2 # All rights reserved.
   3 #
   4 # Redistribution and use in source and binary forms, with or without
   5 # modification, are permitted provided that the following conditions are
   6 # met: redistributions of source code must retain the above copyright
   7 # notice, this list of conditions and the following disclaimer;
   8 # redistributions in binary form must reproduce the above copyright
   9 # notice, this list of conditions and the following disclaimer in the
  10 # documentation and/or other materials provided with the distribution;
  11 # neither the name of the copyright holders nor the names of its
  12 # contributors may be used to endorse or promote products derived from
  13 # this software without specific prior written permission.
  14 #
  15 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26 #
  27 # Authors: Nathan Binkert
  28
  29 import MySQLdb
  30
  31 class MyDB(object):
  32     def __init__(self, options):
  33         self.name = options.db
  34         self.host = options.host
  35         self.user = options.user
  36         self.passwd = options.passwd
  37         self.mydb = None
  38         self.cursor = None
  39
  40     def admin(self):
  41         self.close()
  42         self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
  43                                     passwd=self.passwd)
  44         self.cursor = self.mydb.cursor()
  45
  46     def connect(self):
  47         self.close()
  48         self.mydb = MySQLdb.connect(db=self.name, host=self.host,
  49                                     user=self.user, passwd=self.passwd)
  50         self.cursor = self.mydb.cursor()
  51
  52     def close(self):
  53         if self.mydb is not None:
  54             self.mydb.close()
  55         self.cursor = None
  56
  57     def query(self, sql):
  58         self.cursor.execute(sql)
  59
  60     def drop(self):
  61         self.query('DROP DATABASE IF EXISTS %s' % self.name)
  62
  63     def create(self):
  64         self.query('CREATE DATABASE %s' % self.name)
  65
  66     def populate(self):
  67         #
  68         # Each run (or simulation) gets its own entry in the runs table to
  69         # group stats by where they were generated
  70         #
  71         # COLUMNS:
  72         #   'id' is a unique identifier for each run to be used in other
  73         #       tables.
  74         #   'name' is the user designated name for the data generated.  It is
  75         #       configured in the simulator.
  76         #   'user' identifies the user that generated the data for the given
  77         #       run.
  78         #   'project' another name to identify runs for a specific goal
  79         #   'date' is a timestamp for when the data was generated.  It can be
  80         #       used to easily expire data that was generated in the past.
  81         #   'expire' is a timestamp for when the data should be removed from
  82         #       the database so we don't have years worth of junk.
  83         #
  84         # INDEXES:
  85         #   'run' is indexed so you can find out details of a run if the run
  86         #       was retreived from the data table.
  87         #   'name' is indexed so that two all run names are forced to be unique
  88         #
  89         self.query('''
  90         CREATE TABLE runs(
  91             rn_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
  92             rn_name     VARCHAR(200)            NOT NULL,
  93             rn_sample   VARCHAR(32)             NOT NULL,
  94             rn_user     VARCHAR(32)             NOT NULL,
  95             rn_project  VARCHAR(100)            NOT NULL,
  96             rn_date     TIMESTAMP               NOT NULL,
  97             rn_expire   TIMESTAMP               NOT NULL,
  98             PRIMARY KEY (rn_id),
  99             UNIQUE (rn_name,rn_sample)
 100         ) TYPE=InnoDB''')
 101
 102         #
 103         # We keep the bin names separate so that the data table doesn't get
 104         # huge since bin names are frequently repeated.
 105         #
 106         # COLUMNS:
 107         #   'id' is the unique bin identifer.
 108         #   'name' is the string name for the bin.
 109         #
 110         # INDEXES:
 111         #   'bin' is indexed to get the name of a bin when data is retrieved
 112         #       via the data table.
 113         #   'name' is indexed to get the bin id for a named bin when you want
 114         #       to search the data table based on a specific bin.
 115         #
 116         self.query('''
 117         CREATE TABLE bins(
 118             bn_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 119             bn_name     VARCHAR(255)            NOT NULL,
 120             PRIMARY KEY(bn_id),
 121             UNIQUE (bn_name)
 122         ) TYPE=InnoDB''')
 123
 124         #
 125         # The stat table gives us all of the data for a particular stat.
 126         #
 127         # COLUMNS:
 128         #   'stat' is a unique identifier for each stat to be used in other
 129         #       tables for references.
 130         #   'name' is simply the simulator derived name for a given
 131         #       statistic.
 132         #   'descr' is the description of the statistic and what it tells
 133         #       you.
 134         #   'type' defines what the stat tells you.  Types are:
 135         #       SCALAR: A simple scalar statistic that holds one value
 136         #       VECTOR: An array of statistic values.  Such a something that
 137         #           is generated per-thread.  Vectors exist to give averages,
 138         #            pdfs, cdfs, means, standard deviations, etc across the
 139         #           stat values.
 140         #       DIST: Is a distribution of data.  When the statistic value is
 141         #            sampled, its value is counted in a particular bucket.
 142         #           Useful for keeping track of utilization of a resource.
 143         #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
 144         #       VECTORDIST: Can be used when the distribution needs to be
 145         #            factored out into a per-thread distribution of data for
 146         #            example.  It can still be summed across threads to find
 147         #           the total distribution.
 148         #       VECTOR2D: Can be used when you have a stat that is not only
 149         #           per-thread, but it is per-something else.  Like
 150         #           per-message type.
 151         #       FORMULA: This statistic is a formula, and its data must be
 152         #            looked up in the formula table, for indicating how to
 153         #           present its values.
 154         #   'subdata' is potentially used by any of the vector types to
 155         #       give a specific name to all of the data elements within a
 156         #       stat.
 157         #   'print' indicates whether this stat should be printed ever.
 158         #       (Unnamed stats don't usually get printed)
 159         #   'prereq' only print the stat if the prereq is not zero.
 160         #   'prec' number of decimal places to print
 161         #   'nozero' don't print zero values
 162         #   'nonan' don't print NaN values
 163         #   'total' for vector type stats, print the total.
 164         #   'pdf' for vector type stats, print the pdf.
 165         #   'cdf' for vector type stats, print the cdf.
 166         #
 167         #   The Following are for dist type stats:
 168         #   'min' is the minimum bucket value. Anything less is an underflow.
 169         #   'max' is the maximum bucket value. Anything more is an overflow.
 170         #   'bktsize' is the approximate number of entries in each bucket.
 171         #   'size' is the number of buckets. equal to (min/max)/bktsize.
 172         #
 173         # INDEXES:
 174         #   'stat' is indexed so that you can find out details about a stat
 175         #       if the stat id was retrieved from the data table.
 176         #   'name' is indexed so that you can simply look up data about a
 177         #       named stat.
 178         #
 179         self.query('''
 180         CREATE TABLE stats(
 181             st_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 182             st_name     VARCHAR(255)            NOT NULL,
 183             st_descr    TEXT                    NOT NULL,
 184             st_type     ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
 185                 "VECTOR2D", "FORMULA")  NOT NULL,
 186             st_print    BOOL                    NOT NULL,
 187             st_prereq   SMALLINT UNSIGNED       NOT NULL,
 188             st_prec     TINYINT                 NOT NULL,
 189             st_nozero   BOOL                    NOT NULL,
 190             st_nonan    BOOL                    NOT NULL,
 191             st_total    BOOL                    NOT NULL,
 192             st_pdf      BOOL                    NOT NULL,
 193             st_cdf      BOOL                    NOT NULL,
 194             st_min      DOUBLE                  NOT NULL,
 195             st_max      DOUBLE                  NOT NULL,
 196             st_bktsize  DOUBLE                  NOT NULL,
 197             st_size     SMALLINT UNSIGNED       NOT NULL,
 198             PRIMARY KEY (st_id),
 199             UNIQUE (st_name)
 200         ) TYPE=InnoDB''')
 201
 202         #
 203         # This is the main table of data for stats.
 204         #
 205         # COLUMNS:
 206         #   'stat' refers to the stat field given in the stat table.
 207         #
 208         #   'x' referrs to the first dimension of a multi-dimensional stat. For
 209         #       a vector, x will start at 0 and increase for each vector
 210         #       element.
 211         #       For a distribution:
 212         #       -1: sum (for calculating standard deviation)
 213         #       -2: sum of squares (for calculating standard deviation)
 214         #       -3: total number of samples taken (for calculating
 215         #           standard deviation)
 216         #       -4: minimum value
 217         #       -5: maximum value
 218         #       -6: underflow
 219         #       -7: overflow
 220         #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
 221         #       dimension.
 222         #   'run' is the run that the data was generated from.  Details up in
 223         #       the run table
 224         #   'tick' is a timestamp generated by the simulator.
 225         #   'bin' is the name of the bin that the data was generated in, if
 226         #       any.
 227         #   'data' is the actual stat value.
 228         #
 229         # INDEXES:
 230         #   'stat' is indexed so that a user can find all of the data for a
 231         #       particular stat. It is not unique, because that specific stat
 232         #       can be found in many runs, bins, and samples, in addition to
 233         #       having entries for the mulidimensional cases.
 234         #   'run' is indexed to allow a user to remove all of the data for a
 235         #       particular execution run.  It can also be used to allow the
 236         #       user to print out all of the data for a given run.
 237         #
 238         self.query('''
 239         CREATE TABLE data(
 240             dt_stat     SMALLINT UNSIGNED       NOT NULL,
 241             dt_x        SMALLINT                NOT NULL,
 242             dt_y        SMALLINT                NOT NULL,
 243             dt_run      SMALLINT UNSIGNED       NOT NULL,
 244             dt_tick     BIGINT UNSIGNED         NOT NULL,
 245             dt_bin      SMALLINT UNSIGNED       NOT NULL,
 246             dt_data     DOUBLE                  NOT NULL,
 247             INDEX (dt_stat),
 248             INDEX (dt_run),
 249             UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin)
 250         ) TYPE=InnoDB;''')
 251
 252         #
 253         # Names and descriptions for multi-dimensional stats (vectors, etc.)
 254         # are stored here instead of having their own entry in the statistics
 255         # table. This allows all parts of a single stat to easily share a
 256         # single id.
 257         #
 258         # COLUMNS:
 259         #   'stat' is the unique stat identifier from the stat table.
 260         #   'x' is the first dimension for multi-dimensional stats
 261         #       corresponding to the data table above.
 262         #   'y' is the second dimension for multi-dimensional stats
 263         #       corresponding to the data table above.
 264         #   'name' is the specific subname for the unique stat,x,y combination.
 265         #   'descr' is the specific description for the uniqe stat,x,y
 266         #        combination.
 267         #
 268         # INDEXES:
 269         #   'stat' is indexed so you can get the subdata for a specific stat.
 270         #
 271         self.query('''
 272         CREATE TABLE subdata(
 273             sd_stat     SMALLINT UNSIGNED       NOT NULL,
 274             sd_x        SMALLINT                NOT NULL,
 275             sd_y        SMALLINT                NOT NULL,
 276             sd_name     VARCHAR(255)            NOT NULL,
 277             sd_descr    TEXT,
 278             UNIQUE (sd_stat,sd_x,sd_y)
 279         ) TYPE=InnoDB''')
 280
 281
 282         #
 283         # The formula table is maintained separately from the data table
 284         # because formula data, unlike other stat data cannot be represented
 285         # there.
 286         #
 287         # COLUMNS:
 288         #   'stat' refers to the stat field generated in the stat table.
 289         #   'formula' is the actual string representation of the formula
 290         #       itself.
 291         #
 292         # INDEXES:
 293         #   'stat' is indexed so that you can just look up a formula.
 294         #
 295         self.query('''
 296         CREATE TABLE formulas(
 297             fm_stat     SMALLINT UNSIGNED       NOT NULL,
 298             fm_formula  BLOB                    NOT NULL,
 299             PRIMARY KEY(fm_stat)
 300         ) TYPE=InnoDB''')
 301
 302         #
 303         # Each stat used in each formula is kept in this table.  This way, if
 304         # you want to print out a particular formula, you can simply find out
 305         # which stats you need by looking in this table.  Additionally, when
 306         # you remove a stat from the stats table and data table, you remove
 307         # any references to the formula in this table.  When a formula is no
 308         # longer referred to, you remove its entry.
 309         #
 310         # COLUMNS:
 311         #   'stat' is the stat id from the stat table above.
 312         #   'child' is the stat id of a stat that is used for this formula.
 313         #       There may be many children for any given 'stat' (formula)
 314         #
 315         # INDEXES:
 316         #   'stat' is indexed so you can look up all of the children for a
 317         #       particular stat.
 318         #   'child' is indexed so that you can remove an entry when a stat is
 319         #       removed.
 320         #
 321         self.query('''
 322         CREATE TABLE formula_ref(
 323             fr_stat     SMALLINT UNSIGNED       NOT NULL,
 324             fr_run      SMALLINT UNSIGNED       NOT NULL,
 325             UNIQUE (fr_stat,fr_run),
 326             INDEX (fr_stat),
 327             INDEX (fr_run)
 328         ) TYPE=InnoDB''')
 329
 330         # COLUMNS:
 331         #   'event' is the unique event id from the event_desc table
 332         #   'run' is simulation run id that this event took place in
 333         #   'tick' is the tick when the event happened
 334         #
 335         # INDEXES:
 336         #   'event' is indexed so you can look up all occurences of a
 337         #       specific event
 338         #   'run' is indexed so you can find all events in a run
 339         #   'tick' is indexed because we want the unique thing anyway
 340         #   'event,run,tick' is unique combination
 341         self.query('''
 342         CREATE TABLE events(
 343             ev_event    SMALLINT UNSIGNED       NOT NULL,
 344             ev_run      SMALLINT UNSIGNED       NOT NULL,
 345             ev_tick     BIGINT   UNSIGNED       NOT NULL,
 346             INDEX(ev_event),
 347             INDEX(ev_run),
 348             INDEX(ev_tick),
 349             UNIQUE(ev_event,ev_run,ev_tick)
 350         ) TYPE=InnoDB''')
 351
 352         # COLUMNS:
 353         #   'id' is the unique description id
 354         #   'name' is the name of the event that occurred
 355         #
 356         # INDEXES:
 357         #   'id' is indexed because it is the primary key and is what you use
 358         #       to look up the descriptions
 359         #   'name' is indexed so one can find the event based on name
 360         #
 361         self.query('''
 362         CREATE TABLE event_names(
 363             en_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 364             en_name     VARCHAR(255)            NOT NULL,
 365             PRIMARY KEY (en_id),
 366             UNIQUE (en_name)
 367         ) TYPE=InnoDB''')
 368
 369     def clean(self):
 370         self.query('''
 371         DELETE data
 372         FROM data
 373         LEFT JOIN runs ON dt_run=rn_id
 374         WHERE rn_id IS NULL''')
 375
 376         self.query('''
 377         DELETE formula_ref
 378         FROM formula_ref
 379         LEFT JOIN runs ON fr_run=rn_id
 380         WHERE rn_id IS NULL''')
 381
 382         self.query('''
 383         DELETE formulas
 384         FROM formulas
 385         LEFT JOIN formula_ref ON fm_stat=fr_stat
 386         WHERE fr_stat IS NULL''')
 387
 388         self.query('''
 389         DELETE stats
 390         FROM stats
 391         LEFT JOIN data ON st_id=dt_stat
 392         WHERE dt_stat IS NULL''')
 393
 394         self.query('''
 395         DELETE subdata
 396         FROM subdata
 397         LEFT JOIN data ON sd_stat=dt_stat
 398         WHERE dt_stat IS NULL''')
 399
 400         self.query('''
 401         DELETE bins
 402         FROM bins
 403         LEFT JOIN data ON bn_id=dt_bin
 404         WHERE dt_bin IS NULL''')
 405
 406         self.query('''
 407         DELETE events
 408         FROM events
 409         LEFT JOIN runs ON ev_run=rn_id
 410         WHERE rn_id IS NULL''')
 411
 412         self.query('''
 413         DELETE event_names
 414         FROM event_names
 415         LEFT JOIN events ON en_id=ev_event
 416         WHERE ev_event IS NULL''')