Merge iceaxe.:/Volumes/work/research/m5/head
[gem5.git] / util / stats / dbinit.py
1 # Copyright (c) 2003-2004 The Regents of The University of Michigan
2 # All rights reserved.
3 #
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are
6 # met: redistributions of source code must retain the above copyright
7 # notice, this list of conditions and the following disclaimer;
8 # redistributions in binary form must reproduce the above copyright
9 # notice, this list of conditions and the following disclaimer in the
10 # documentation and/or other materials provided with the distribution;
11 # neither the name of the copyright holders nor the names of its
12 # contributors may be used to endorse or promote products derived from
13 # this software without specific prior written permission.
14 #
15 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 #
27 # Authors: Nathan Binkert
28
29 import MySQLdb
30
31 class MyDB(object):
32 def __init__(self, options):
33 self.name = options.db
34 self.host = options.host
35 self.user = options.user
36 self.passwd = options.passwd
37 self.mydb = None
38 self.cursor = None
39
40 def admin(self):
41 self.close()
42 self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
43 passwd=self.passwd)
44 self.cursor = self.mydb.cursor()
45
46 def connect(self):
47 self.close()
48 self.mydb = MySQLdb.connect(db=self.name, host=self.host,
49 user=self.user, passwd=self.passwd)
50 self.cursor = self.mydb.cursor()
51
52 def close(self):
53 if self.mydb is not None:
54 self.mydb.close()
55 self.cursor = None
56
57 def query(self, sql):
58 self.cursor.execute(sql)
59
60 def drop(self):
61 self.query('DROP DATABASE IF EXISTS %s' % self.name)
62
63 def create(self):
64 self.query('CREATE DATABASE %s' % self.name)
65
66 def populate(self):
67 #
68 # Each run (or simulation) gets its own entry in the runs table to
69 # group stats by where they were generated
70 #
71 # COLUMNS:
72 # 'id' is a unique identifier for each run to be used in other
73 # tables.
74 # 'name' is the user designated name for the data generated. It is
75 # configured in the simulator.
76 # 'user' identifies the user that generated the data for the given
77 # run.
78 # 'project' another name to identify runs for a specific goal
79 # 'date' is a timestamp for when the data was generated. It can be
80 # used to easily expire data that was generated in the past.
81 # 'expire' is a timestamp for when the data should be removed from
82 # the database so we don't have years worth of junk.
83 #
84 # INDEXES:
85 # 'run' is indexed so you can find out details of a run if the run
86 # was retreived from the data table.
87 # 'name' is indexed so that two all run names are forced to be unique
88 #
89 self.query('''
90 CREATE TABLE runs(
91 rn_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT,
92 rn_name VARCHAR(200) NOT NULL,
93 rn_sample VARCHAR(32) NOT NULL,
94 rn_user VARCHAR(32) NOT NULL,
95 rn_project VARCHAR(100) NOT NULL,
96 rn_date TIMESTAMP NOT NULL,
97 rn_expire TIMESTAMP NOT NULL,
98 PRIMARY KEY (rn_id),
99 UNIQUE (rn_name,rn_sample)
100 ) TYPE=InnoDB''')
101
102 #
103 # The stat table gives us all of the data for a particular stat.
104 #
105 # COLUMNS:
106 # 'stat' is a unique identifier for each stat to be used in other
107 # tables for references.
108 # 'name' is simply the simulator derived name for a given
109 # statistic.
110 # 'descr' is the description of the statistic and what it tells
111 # you.
112 # 'type' defines what the stat tells you. Types are:
113 # SCALAR: A simple scalar statistic that holds one value
114 # VECTOR: An array of statistic values. Such a something that
115 # is generated per-thread. Vectors exist to give averages,
116 # pdfs, cdfs, means, standard deviations, etc across the
117 # stat values.
118 # DIST: Is a distribution of data. When the statistic value is
119 # sampled, its value is counted in a particular bucket.
120 # Useful for keeping track of utilization of a resource.
121 # (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
122 # VECTORDIST: Can be used when the distribution needs to be
123 # factored out into a per-thread distribution of data for
124 # example. It can still be summed across threads to find
125 # the total distribution.
126 # VECTOR2D: Can be used when you have a stat that is not only
127 # per-thread, but it is per-something else. Like
128 # per-message type.
129 # FORMULA: This statistic is a formula, and its data must be
130 # looked up in the formula table, for indicating how to
131 # present its values.
132 # 'subdata' is potentially used by any of the vector types to
133 # give a specific name to all of the data elements within a
134 # stat.
135 # 'print' indicates whether this stat should be printed ever.
136 # (Unnamed stats don't usually get printed)
137 # 'prereq' only print the stat if the prereq is not zero.
138 # 'prec' number of decimal places to print
139 # 'nozero' don't print zero values
140 # 'nonan' don't print NaN values
141 # 'total' for vector type stats, print the total.
142 # 'pdf' for vector type stats, print the pdf.
143 # 'cdf' for vector type stats, print the cdf.
144 #
145 # The Following are for dist type stats:
146 # 'min' is the minimum bucket value. Anything less is an underflow.
147 # 'max' is the maximum bucket value. Anything more is an overflow.
148 # 'bktsize' is the approximate number of entries in each bucket.
149 # 'size' is the number of buckets. equal to (min/max)/bktsize.
150 #
151 # INDEXES:
152 # 'stat' is indexed so that you can find out details about a stat
153 # if the stat id was retrieved from the data table.
154 # 'name' is indexed so that you can simply look up data about a
155 # named stat.
156 #
157 self.query('''
158 CREATE TABLE stats(
159 st_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT,
160 st_name VARCHAR(255) NOT NULL,
161 st_descr TEXT NOT NULL,
162 st_type ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
163 "VECTOR2D", "FORMULA") NOT NULL,
164 st_print BOOL NOT NULL,
165 st_prereq SMALLINT UNSIGNED NOT NULL,
166 st_prec TINYINT NOT NULL,
167 st_nozero BOOL NOT NULL,
168 st_nonan BOOL NOT NULL,
169 st_total BOOL NOT NULL,
170 st_pdf BOOL NOT NULL,
171 st_cdf BOOL NOT NULL,
172 st_min DOUBLE NOT NULL,
173 st_max DOUBLE NOT NULL,
174 st_bktsize DOUBLE NOT NULL,
175 st_size SMALLINT UNSIGNED NOT NULL,
176 PRIMARY KEY (st_id),
177 UNIQUE (st_name)
178 ) TYPE=InnoDB''')
179
180 #
181 # This is the main table of data for stats.
182 #
183 # COLUMNS:
184 # 'stat' refers to the stat field given in the stat table.
185 #
186 # 'x' referrs to the first dimension of a multi-dimensional stat. For
187 # a vector, x will start at 0 and increase for each vector
188 # element.
189 # For a distribution:
190 # -1: sum (for calculating standard deviation)
191 # -2: sum of squares (for calculating standard deviation)
192 # -3: total number of samples taken (for calculating
193 # standard deviation)
194 # -4: minimum value
195 # -5: maximum value
196 # -6: underflow
197 # -7: overflow
198 # 'y' is used by a VECTORDIST and the VECTOR2D to describe the second
199 # dimension.
200 # 'run' is the run that the data was generated from. Details up in
201 # the run table
202 # 'tick' is a timestamp generated by the simulator.
203 # 'data' is the actual stat value.
204 #
205 # INDEXES:
206 # 'stat' is indexed so that a user can find all of the data for a
207 # particular stat. It is not unique, because that specific stat
208 # can be found in many runs and samples, in addition to
209 # having entries for the mulidimensional cases.
210 # 'run' is indexed to allow a user to remove all of the data for a
211 # particular execution run. It can also be used to allow the
212 # user to print out all of the data for a given run.
213 #
214 self.query('''
215 CREATE TABLE data(
216 dt_stat SMALLINT UNSIGNED NOT NULL,
217 dt_x SMALLINT NOT NULL,
218 dt_y SMALLINT NOT NULL,
219 dt_run SMALLINT UNSIGNED NOT NULL,
220 dt_tick BIGINT UNSIGNED NOT NULL,
221 dt_data DOUBLE NOT NULL,
222 INDEX (dt_stat),
223 INDEX (dt_run),
224 UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick)
225 ) TYPE=InnoDB;''')
226
227 #
228 # Names and descriptions for multi-dimensional stats (vectors, etc.)
229 # are stored here instead of having their own entry in the statistics
230 # table. This allows all parts of a single stat to easily share a
231 # single id.
232 #
233 # COLUMNS:
234 # 'stat' is the unique stat identifier from the stat table.
235 # 'x' is the first dimension for multi-dimensional stats
236 # corresponding to the data table above.
237 # 'y' is the second dimension for multi-dimensional stats
238 # corresponding to the data table above.
239 # 'name' is the specific subname for the unique stat,x,y combination.
240 # 'descr' is the specific description for the uniqe stat,x,y
241 # combination.
242 #
243 # INDEXES:
244 # 'stat' is indexed so you can get the subdata for a specific stat.
245 #
246 self.query('''
247 CREATE TABLE subdata(
248 sd_stat SMALLINT UNSIGNED NOT NULL,
249 sd_x SMALLINT NOT NULL,
250 sd_y SMALLINT NOT NULL,
251 sd_name VARCHAR(255) NOT NULL,
252 sd_descr TEXT,
253 UNIQUE (sd_stat,sd_x,sd_y)
254 ) TYPE=InnoDB''')
255
256
257 #
258 # The formula table is maintained separately from the data table
259 # because formula data, unlike other stat data cannot be represented
260 # there.
261 #
262 # COLUMNS:
263 # 'stat' refers to the stat field generated in the stat table.
264 # 'formula' is the actual string representation of the formula
265 # itself.
266 #
267 # INDEXES:
268 # 'stat' is indexed so that you can just look up a formula.
269 #
270 self.query('''
271 CREATE TABLE formulas(
272 fm_stat SMALLINT UNSIGNED NOT NULL,
273 fm_formula BLOB NOT NULL,
274 PRIMARY KEY(fm_stat)
275 ) TYPE=InnoDB''')
276
277 #
278 # Each stat used in each formula is kept in this table. This way, if
279 # you want to print out a particular formula, you can simply find out
280 # which stats you need by looking in this table. Additionally, when
281 # you remove a stat from the stats table and data table, you remove
282 # any references to the formula in this table. When a formula is no
283 # longer referred to, you remove its entry.
284 #
285 # COLUMNS:
286 # 'stat' is the stat id from the stat table above.
287 # 'child' is the stat id of a stat that is used for this formula.
288 # There may be many children for any given 'stat' (formula)
289 #
290 # INDEXES:
291 # 'stat' is indexed so you can look up all of the children for a
292 # particular stat.
293 # 'child' is indexed so that you can remove an entry when a stat is
294 # removed.
295 #
296 self.query('''
297 CREATE TABLE formula_ref(
298 fr_stat SMALLINT UNSIGNED NOT NULL,
299 fr_run SMALLINT UNSIGNED NOT NULL,
300 UNIQUE (fr_stat,fr_run),
301 INDEX (fr_stat),
302 INDEX (fr_run)
303 ) TYPE=InnoDB''')
304
305 # COLUMNS:
306 # 'event' is the unique event id from the event_desc table
307 # 'run' is simulation run id that this event took place in
308 # 'tick' is the tick when the event happened
309 #
310 # INDEXES:
311 # 'event' is indexed so you can look up all occurences of a
312 # specific event
313 # 'run' is indexed so you can find all events in a run
314 # 'tick' is indexed because we want the unique thing anyway
315 # 'event,run,tick' is unique combination
316 self.query('''
317 CREATE TABLE events(
318 ev_event SMALLINT UNSIGNED NOT NULL,
319 ev_run SMALLINT UNSIGNED NOT NULL,
320 ev_tick BIGINT UNSIGNED NOT NULL,
321 INDEX(ev_event),
322 INDEX(ev_run),
323 INDEX(ev_tick),
324 UNIQUE(ev_event,ev_run,ev_tick)
325 ) TYPE=InnoDB''')
326
327 # COLUMNS:
328 # 'id' is the unique description id
329 # 'name' is the name of the event that occurred
330 #
331 # INDEXES:
332 # 'id' is indexed because it is the primary key and is what you use
333 # to look up the descriptions
334 # 'name' is indexed so one can find the event based on name
335 #
336 self.query('''
337 CREATE TABLE event_names(
338 en_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT,
339 en_name VARCHAR(255) NOT NULL,
340 PRIMARY KEY (en_id),
341 UNIQUE (en_name)
342 ) TYPE=InnoDB''')
343
344 def clean(self):
345 self.query('''
346 DELETE data
347 FROM data
348 LEFT JOIN runs ON dt_run=rn_id
349 WHERE rn_id IS NULL''')
350
351 self.query('''
352 DELETE formula_ref
353 FROM formula_ref
354 LEFT JOIN runs ON fr_run=rn_id
355 WHERE rn_id IS NULL''')
356
357 self.query('''
358 DELETE formulas
359 FROM formulas
360 LEFT JOIN formula_ref ON fm_stat=fr_stat
361 WHERE fr_stat IS NULL''')
362
363 self.query('''
364 DELETE stats
365 FROM stats
366 LEFT JOIN data ON st_id=dt_stat
367 WHERE dt_stat IS NULL''')
368
369 self.query('''
370 DELETE subdata
371 FROM subdata
372 LEFT JOIN data ON sd_stat=dt_stat
373 WHERE dt_stat IS NULL''')
374
375 self.query('''
376 DELETE events
377 FROM events
378 LEFT JOIN runs ON ev_run=rn_id
379 WHERE rn_id IS NULL''')
380
381 self.query('''
382 DELETE event_names
383 FROM event_names
384 LEFT JOIN events ON en_id=ev_event
385 WHERE ev_event IS NULL''')