util/stats/dbinit.py

   1 # Copyright (c) 2003-2004 The Regents of The University of Michigan
   2 # All rights reserved.
   3 #
   4 # Redistribution and use in source and binary forms, with or without
   5 # modification, are permitted provided that the following conditions are
   6 # met: redistributions of source code must retain the above copyright
   7 # notice, this list of conditions and the following disclaimer;
   8 # redistributions in binary form must reproduce the above copyright
   9 # notice, this list of conditions and the following disclaimer in the
  10 # documentation and/or other materials provided with the distribution;
  11 # neither the name of the copyright holders nor the names of its
  12 # contributors may be used to endorse or promote products derived from
  13 # this software without specific prior written permission.
  14 #
  15 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  18 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  19 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  20 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  21 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  22 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  23 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26 #
  27 # Authors: Nathan Binkert
  28
  29 import MySQLdb
  30
  31 class MyDB(object):
  32     def __init__(self, options):
  33         self.name = options.db
  34         self.host = options.host
  35         self.user = options.user
  36         self.passwd = options.passwd
  37         self.mydb = None
  38         self.cursor = None
  39
  40     def admin(self):
  41         self.close()
  42         self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
  43                                     passwd=self.passwd)
  44         self.cursor = self.mydb.cursor()
  45
  46     def connect(self):
  47         self.close()
  48         self.mydb = MySQLdb.connect(db=self.name, host=self.host,
  49                                     user=self.user, passwd=self.passwd)
  50         self.cursor = self.mydb.cursor()
  51
  52     def close(self):
  53         if self.mydb is not None:
  54             self.mydb.close()
  55         self.cursor = None
  56
  57     def query(self, sql):
  58         self.cursor.execute(sql)
  59
  60     def drop(self):
  61         self.query('DROP DATABASE IF EXISTS %s' % self.name)
  62
  63     def create(self):
  64         self.query('CREATE DATABASE %s' % self.name)
  65
  66     def populate(self):
  67         #
  68         # Each run (or simulation) gets its own entry in the runs table to
  69         # group stats by where they were generated
  70         #
  71         # COLUMNS:
  72         #   'id' is a unique identifier for each run to be used in other
  73         #       tables.
  74         #   'name' is the user designated name for the data generated.  It is
  75         #       configured in the simulator.
  76         #   'user' identifies the user that generated the data for the given
  77         #       run.
  78         #   'project' another name to identify runs for a specific goal
  79         #   'date' is a timestamp for when the data was generated.  It can be
  80         #       used to easily expire data that was generated in the past.
  81         #   'expire' is a timestamp for when the data should be removed from
  82         #       the database so we don't have years worth of junk.
  83         #
  84         # INDEXES:
  85         #   'run' is indexed so you can find out details of a run if the run
  86         #       was retreived from the data table.
  87         #   'name' is indexed so that two all run names are forced to be unique
  88         #
  89         self.query('''
  90         CREATE TABLE runs(
  91             rn_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
  92             rn_name     VARCHAR(200)            NOT NULL,
  93             rn_sample   VARCHAR(32)             NOT NULL,
  94             rn_user     VARCHAR(32)             NOT NULL,
  95             rn_project  VARCHAR(100)            NOT NULL,
  96             rn_date     TIMESTAMP               NOT NULL,
  97             rn_expire   TIMESTAMP               NOT NULL,
  98             PRIMARY KEY (rn_id),
  99             UNIQUE (rn_name,rn_sample)
 100         ) TYPE=InnoDB''')
 101
 102         #
 103         # The stat table gives us all of the data for a particular stat.
 104         #
 105         # COLUMNS:
 106         #   'stat' is a unique identifier for each stat to be used in other
 107         #       tables for references.
 108         #   'name' is simply the simulator derived name for a given
 109         #       statistic.
 110         #   'descr' is the description of the statistic and what it tells
 111         #       you.
 112         #   'type' defines what the stat tells you.  Types are:
 113         #       SCALAR: A simple scalar statistic that holds one value
 114         #       VECTOR: An array of statistic values.  Such a something that
 115         #           is generated per-thread.  Vectors exist to give averages,
 116         #            pdfs, cdfs, means, standard deviations, etc across the
 117         #           stat values.
 118         #       DIST: Is a distribution of data.  When the statistic value is
 119         #            sampled, its value is counted in a particular bucket.
 120         #           Useful for keeping track of utilization of a resource.
 121         #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
 122         #       VECTORDIST: Can be used when the distribution needs to be
 123         #            factored out into a per-thread distribution of data for
 124         #            example.  It can still be summed across threads to find
 125         #           the total distribution.
 126         #       VECTOR2D: Can be used when you have a stat that is not only
 127         #           per-thread, but it is per-something else.  Like
 128         #           per-message type.
 129         #       FORMULA: This statistic is a formula, and its data must be
 130         #            looked up in the formula table, for indicating how to
 131         #           present its values.
 132         #   'subdata' is potentially used by any of the vector types to
 133         #       give a specific name to all of the data elements within a
 134         #       stat.
 135         #   'print' indicates whether this stat should be printed ever.
 136         #       (Unnamed stats don't usually get printed)
 137         #   'prereq' only print the stat if the prereq is not zero.
 138         #   'prec' number of decimal places to print
 139         #   'nozero' don't print zero values
 140         #   'nonan' don't print NaN values
 141         #   'total' for vector type stats, print the total.
 142         #   'pdf' for vector type stats, print the pdf.
 143         #   'cdf' for vector type stats, print the cdf.
 144         #
 145         #   The Following are for dist type stats:
 146         #   'min' is the minimum bucket value. Anything less is an underflow.
 147         #   'max' is the maximum bucket value. Anything more is an overflow.
 148         #   'bktsize' is the approximate number of entries in each bucket.
 149         #   'size' is the number of buckets. equal to (min/max)/bktsize.
 150         #
 151         # INDEXES:
 152         #   'stat' is indexed so that you can find out details about a stat
 153         #       if the stat id was retrieved from the data table.
 154         #   'name' is indexed so that you can simply look up data about a
 155         #       named stat.
 156         #
 157         self.query('''
 158         CREATE TABLE stats(
 159             st_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 160             st_name     VARCHAR(255)            NOT NULL,
 161             st_descr    TEXT                    NOT NULL,
 162             st_type     ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
 163                 "VECTOR2D", "FORMULA")  NOT NULL,
 164             st_print    BOOL                    NOT NULL,
 165             st_prereq   SMALLINT UNSIGNED       NOT NULL,
 166             st_prec     TINYINT                 NOT NULL,
 167             st_nozero   BOOL                    NOT NULL,
 168             st_nonan    BOOL                    NOT NULL,
 169             st_total    BOOL                    NOT NULL,
 170             st_pdf      BOOL                    NOT NULL,
 171             st_cdf      BOOL                    NOT NULL,
 172             st_min      DOUBLE                  NOT NULL,
 173             st_max      DOUBLE                  NOT NULL,
 174             st_bktsize  DOUBLE                  NOT NULL,
 175             st_size     SMALLINT UNSIGNED       NOT NULL,
 176             PRIMARY KEY (st_id),
 177             UNIQUE (st_name)
 178         ) TYPE=InnoDB''')
 179
 180         #
 181         # This is the main table of data for stats.
 182         #
 183         # COLUMNS:
 184         #   'stat' refers to the stat field given in the stat table.
 185         #
 186         #   'x' referrs to the first dimension of a multi-dimensional stat. For
 187         #       a vector, x will start at 0 and increase for each vector
 188         #       element.
 189         #       For a distribution:
 190         #       -1: sum (for calculating standard deviation)
 191         #       -2: sum of squares (for calculating standard deviation)
 192         #       -3: total number of samples taken (for calculating
 193         #           standard deviation)
 194         #       -4: minimum value
 195         #       -5: maximum value
 196         #       -6: underflow
 197         #       -7: overflow
 198         #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
 199         #       dimension.
 200         #   'run' is the run that the data was generated from.  Details up in
 201         #       the run table
 202         #   'tick' is a timestamp generated by the simulator.
 203         #   'data' is the actual stat value.
 204         #
 205         # INDEXES:
 206         #   'stat' is indexed so that a user can find all of the data for a
 207         #       particular stat. It is not unique, because that specific stat
 208         #       can be found in many runs and samples, in addition to
 209         #       having entries for the mulidimensional cases.
 210         #   'run' is indexed to allow a user to remove all of the data for a
 211         #       particular execution run.  It can also be used to allow the
 212         #       user to print out all of the data for a given run.
 213         #
 214         self.query('''
 215         CREATE TABLE data(
 216             dt_stat     SMALLINT UNSIGNED       NOT NULL,
 217             dt_x        SMALLINT                NOT NULL,
 218             dt_y        SMALLINT                NOT NULL,
 219             dt_run      SMALLINT UNSIGNED       NOT NULL,
 220             dt_tick     BIGINT UNSIGNED         NOT NULL,
 221             dt_data     DOUBLE                  NOT NULL,
 222             INDEX (dt_stat),
 223             INDEX (dt_run),
 224             UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick)
 225         ) TYPE=InnoDB;''')
 226
 227         #
 228         # Names and descriptions for multi-dimensional stats (vectors, etc.)
 229         # are stored here instead of having their own entry in the statistics
 230         # table. This allows all parts of a single stat to easily share a
 231         # single id.
 232         #
 233         # COLUMNS:
 234         #   'stat' is the unique stat identifier from the stat table.
 235         #   'x' is the first dimension for multi-dimensional stats
 236         #       corresponding to the data table above.
 237         #   'y' is the second dimension for multi-dimensional stats
 238         #       corresponding to the data table above.
 239         #   'name' is the specific subname for the unique stat,x,y combination.
 240         #   'descr' is the specific description for the uniqe stat,x,y
 241         #        combination.
 242         #
 243         # INDEXES:
 244         #   'stat' is indexed so you can get the subdata for a specific stat.
 245         #
 246         self.query('''
 247         CREATE TABLE subdata(
 248             sd_stat     SMALLINT UNSIGNED       NOT NULL,
 249             sd_x        SMALLINT                NOT NULL,
 250             sd_y        SMALLINT                NOT NULL,
 251             sd_name     VARCHAR(255)            NOT NULL,
 252             sd_descr    TEXT,
 253             UNIQUE (sd_stat,sd_x,sd_y)
 254         ) TYPE=InnoDB''')
 255
 256
 257         #
 258         # The formula table is maintained separately from the data table
 259         # because formula data, unlike other stat data cannot be represented
 260         # there.
 261         #
 262         # COLUMNS:
 263         #   'stat' refers to the stat field generated in the stat table.
 264         #   'formula' is the actual string representation of the formula
 265         #       itself.
 266         #
 267         # INDEXES:
 268         #   'stat' is indexed so that you can just look up a formula.
 269         #
 270         self.query('''
 271         CREATE TABLE formulas(
 272             fm_stat     SMALLINT UNSIGNED       NOT NULL,
 273             fm_formula  BLOB                    NOT NULL,
 274             PRIMARY KEY(fm_stat)
 275         ) TYPE=InnoDB''')
 276
 277         #
 278         # Each stat used in each formula is kept in this table.  This way, if
 279         # you want to print out a particular formula, you can simply find out
 280         # which stats you need by looking in this table.  Additionally, when
 281         # you remove a stat from the stats table and data table, you remove
 282         # any references to the formula in this table.  When a formula is no
 283         # longer referred to, you remove its entry.
 284         #
 285         # COLUMNS:
 286         #   'stat' is the stat id from the stat table above.
 287         #   'child' is the stat id of a stat that is used for this formula.
 288         #       There may be many children for any given 'stat' (formula)
 289         #
 290         # INDEXES:
 291         #   'stat' is indexed so you can look up all of the children for a
 292         #       particular stat.
 293         #   'child' is indexed so that you can remove an entry when a stat is
 294         #       removed.
 295         #
 296         self.query('''
 297         CREATE TABLE formula_ref(
 298             fr_stat     SMALLINT UNSIGNED       NOT NULL,
 299             fr_run      SMALLINT UNSIGNED       NOT NULL,
 300             UNIQUE (fr_stat,fr_run),
 301             INDEX (fr_stat),
 302             INDEX (fr_run)
 303         ) TYPE=InnoDB''')
 304
 305         # COLUMNS:
 306         #   'event' is the unique event id from the event_desc table
 307         #   'run' is simulation run id that this event took place in
 308         #   'tick' is the tick when the event happened
 309         #
 310         # INDEXES:
 311         #   'event' is indexed so you can look up all occurences of a
 312         #       specific event
 313         #   'run' is indexed so you can find all events in a run
 314         #   'tick' is indexed because we want the unique thing anyway
 315         #   'event,run,tick' is unique combination
 316         self.query('''
 317         CREATE TABLE events(
 318             ev_event    SMALLINT UNSIGNED       NOT NULL,
 319             ev_run      SMALLINT UNSIGNED       NOT NULL,
 320             ev_tick     BIGINT   UNSIGNED       NOT NULL,
 321             INDEX(ev_event),
 322             INDEX(ev_run),
 323             INDEX(ev_tick),
 324             UNIQUE(ev_event,ev_run,ev_tick)
 325         ) TYPE=InnoDB''')
 326
 327         # COLUMNS:
 328         #   'id' is the unique description id
 329         #   'name' is the name of the event that occurred
 330         #
 331         # INDEXES:
 332         #   'id' is indexed because it is the primary key and is what you use
 333         #       to look up the descriptions
 334         #   'name' is indexed so one can find the event based on name
 335         #
 336         self.query('''
 337         CREATE TABLE event_names(
 338             en_id       SMALLINT UNSIGNED       NOT NULL AUTO_INCREMENT,
 339             en_name     VARCHAR(255)            NOT NULL,
 340             PRIMARY KEY (en_id),
 341             UNIQUE (en_name)
 342         ) TYPE=InnoDB''')
 343
 344     def clean(self):
 345         self.query('''
 346         DELETE data
 347         FROM data
 348         LEFT JOIN runs ON dt_run=rn_id
 349         WHERE rn_id IS NULL''')
 350
 351         self.query('''
 352         DELETE formula_ref
 353         FROM formula_ref
 354         LEFT JOIN runs ON fr_run=rn_id
 355         WHERE rn_id IS NULL''')
 356
 357         self.query('''
 358         DELETE formulas
 359         FROM formulas
 360         LEFT JOIN formula_ref ON fm_stat=fr_stat
 361         WHERE fr_stat IS NULL''')
 362
 363         self.query('''
 364         DELETE stats
 365         FROM stats
 366         LEFT JOIN data ON st_id=dt_stat
 367         WHERE dt_stat IS NULL''')
 368
 369         self.query('''
 370         DELETE subdata
 371         FROM subdata
 372         LEFT JOIN data ON sd_stat=dt_stat
 373         WHERE dt_stat IS NULL''')
 374
 375         self.query('''
 376         DELETE events
 377         FROM events
 378         LEFT JOIN runs ON ev_run=rn_id
 379         WHERE rn_id IS NULL''')
 380
 381         self.query('''
 382         DELETE event_names
 383         FROM event_names
 384         LEFT JOIN events ON en_id=ev_event
 385         WHERE ev_event IS NULL''')