arch-gcn3: make read2st64_b32 write proper registers
[gem5.git] / ext / mcpat / logic.cc
1 /*****************************************************************************
2 * McPAT
3 * SOFTWARE LICENSE AGREEMENT
4 * Copyright 2012 Hewlett-Packard Development Company, L.P.
5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
6 * All Rights Reserved
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are
10 * met: redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer;
12 * redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution;
15 * neither the name of the copyright holders nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 ***************************************************************************/
32
33 #include "common.h"
34 #include "logic.h"
35
36 //selection_logic
37 selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default,
38 int _win_entries, int issue_width_,
39 const InputParameter *configure_interface,
40 string _name, double _accesses,
41 double clockRate_, enum Device_ty device_ty_,
42 enum Core_type core_ty_)
43 : McPATComponent(_xml_data), is_default(_is_default),
44 win_entries(_win_entries),
45 issue_width(issue_width_),
46 accesses(_accesses),
47 device_ty(device_ty_),
48 core_ty(core_ty_) {
49 clockRate = clockRate_;
50 name = _name;
51 l_ip = *configure_interface;
52 local_result = init_interface(&l_ip, name);
53 }
54
55 void selection_logic::computeArea() {
56 output_data.area = local_result.area;
57 }
58
59 void selection_logic::computeEnergy() {
60 //based on cost effective superscalar processor TR pp27-31
61 double Ctotal, Cor, Cpencode;
62 int num_arbiter;
63 double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
64
65 //the 0.8um process data is used.
66 //this was 10 micron for the 0.8 micron process
67 WSelORn = 12.5 * l_ip.F_sz_um;
68 //this was 40 micron for the 0.8 micron process
69 WSelORprequ = 50 * l_ip.F_sz_um;
70 //this was 10mcron for the 0.8 micron process
71 WSelPn = 12.5 * l_ip.F_sz_um;
72 //this was 15 micron for the 0.8 micron process
73 WSelPp = 18.75 * l_ip.F_sz_um;
74 //this was 5 micron for the 0.8 micron process
75 WSelEnn = 6.25 * l_ip.F_sz_um;
76 //this was 10 micron for the 0.8 micron process
77 WSelEnp = 12.5 * l_ip.F_sz_um;
78
79 Ctotal = 0;
80 num_arbiter = 1;
81 while (win_entries > 4) {
82 win_entries = (int)ceil((double)win_entries / 4.0);
83 num_arbiter += win_entries;
84 }
85 //the 4-input OR logic to generate anyreq
86 Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
87 drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
88 power.readOp.gate_leakage =
89 cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;
90
91 //The total capacity of the 4-bit priority encoder
92 Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
93 drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
94 2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
95 drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
96 3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
97 drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
98 4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
99 drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic
100 2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
101 4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
102 2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic
103 (2 * 4 + 2 * 3 + 2 * 2 + 2) *
104 gate_C(WSelPn + WSelPp, 10.0);//requests signal
105
106 Ctotal += issue_width * num_arbiter * (Cor + Cpencode);
107
108 //2 means the abitration signal need to travel round trip
109 power.readOp.dynamic =
110 Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2;
111 power.readOp.leakage = issue_width * num_arbiter *
112 (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
113 + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
114 + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
115 + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
116 + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
117 ) * g_tp.peri_global.Vdd;
118 power.readOp.gate_leakage = issue_width * num_arbiter *
119 (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
120 + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
121 + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
122 + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
123 + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
124 ) * g_tp.peri_global.Vdd;
125 double sckRation = g_tp.sckt_co_eff;
126 power.readOp.dynamic *= sckRation;
127 power.writeOp.dynamic *= sckRation;
128 power.searchOp.dynamic *= sckRation;
129
130 double long_channel_device_reduction =
131 longer_channel_device_reduction(device_ty, core_ty);
132 power.readOp.longer_channel_leakage =
133 power.readOp.leakage * long_channel_device_reduction;
134
135 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
136 output_data.subthreshold_leakage_power = power.readOp.leakage;
137 output_data.gate_leakage_power = power.readOp.gate_leakage;
138 output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses;
139 }
140
141 dep_resource_conflict_check::dep_resource_conflict_check(
142 XMLNode* _xml_data, const string _name,
143 const InputParameter *configure_interface,
144 const CoreParameters & dyn_p_, int compare_bits_,
145 double clockRate_, bool _is_default)
146 : McPATComponent(_xml_data), l_ip(*configure_interface),
147 coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) {
148
149 name = _name;
150 clockRate = clockRate_;
151 //this was 20.0 micron for the 0.8 micron process
152 Wcompn = 25 * l_ip.F_sz_um;
153 //this was 20.0 micron for the 0.8 micron process
154 Wevalinvp = 25 * l_ip.F_sz_um;
155 //this was 80.0 mcron for the 0.8 micron process
156 Wevalinvn = 100 * l_ip.F_sz_um;
157 //this was 40.0 micron for the 0.8 micron process
158 Wcomppreequ = 50 * l_ip.F_sz_um;
159 //this was 5.4 micron for the 0.8 micron process
160 WNORn = 6.75 * l_ip.F_sz_um;
161 //this was 30.5 micron for the 0.8 micron process
162 WNORp = 38.125 * l_ip.F_sz_um;
163
164 // To make CACTI happy.
165 l_ip.cache_sz = MIN_BUFFER_SIZE;
166 local_result = init_interface(&l_ip, name);
167
168 if (coredynp.core_ty == Inorder)
169 //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
170 //opcode comparator
171 compare_bits += 16 + 8 + 8;
172 else
173 compare_bits += 16 + 8 + 8;
174
175 conflict_check_power();
176 double sckRation = g_tp.sckt_co_eff;
177 power.readOp.dynamic *= sckRation;
178 power.writeOp.dynamic *= sckRation;
179 power.searchOp.dynamic *= sckRation;
180
181 }
182
183 void dep_resource_conflict_check::conflict_check_power() {
184 double Ctotal;
185 int num_comparators;
186 //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
187 //dest to dest comparision.
188 num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
189 coredynp.decodeW);
190
191 Ctotal = num_comparators * compare_cap();
192
193 power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd *
194 g_tp.peri_global.Vdd /*AF*/;
195 power.readOp.leakage = num_comparators * compare_bits * 2 *
196 simplified_nmos_leakage(Wcompn, false);
197
198 double long_channel_device_reduction =
199 longer_channel_device_reduction(Core_device, coredynp.core_ty);
200 power.readOp.longer_channel_leakage =
201 power.readOp.leakage * long_channel_device_reduction;
202 power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
203 cmos_Ig_leakage(Wcompn, 0, 2, nmos);
204
205 }
206
207 /* estimate comparator power consumption (this comparator is similar
208 to the tag-match structure in a CAM */
209 double dep_resource_conflict_check::compare_cap() {
210 double c1, c2;
211
212 //resize the big NOR gate at the DCL according to fan in.
213 WNORp = WNORp * compare_bits / 2.0;
214 /* bottom part of comparator */
215 c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
216 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
217 drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
218 drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);
219
220 /* top part of comparator */
221 c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
222 drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
223 drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
224 gate_C(WNORn + WNORp, 10.0) +
225 drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits *
226 drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);
227 return(c1 + c2);
228
229 }
230
231 void dep_resource_conflict_check::leakage_feedback(double temperature)
232 {
233 l_ip.temp = (unsigned int)round(temperature/10.0)*10;
234 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
235
236 // This is part of conflict_check_power()
237 // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
238 // to dest comparison.
239 int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
240 coredynp.decodeW);
241 power.readOp.leakage = num_comparators * compare_bits * 2 *
242 simplified_nmos_leakage(Wcompn, false);
243
244 double long_channel_device_reduction =
245 longer_channel_device_reduction(Core_device, coredynp.core_ty);
246 power.readOp.longer_channel_leakage = power.readOp.leakage *
247 long_channel_device_reduction;
248 power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
249 cmos_Ig_leakage(Wcompn, 0, 2, nmos);
250 }
251
252
253 DFFCell::DFFCell(
254 bool _is_dram,
255 double _WdecNANDn,
256 double _WdecNANDp,
257 double _cell_load,
258 const InputParameter *configure_interface)
259 : is_dram(_is_dram),
260 cell_load(_cell_load),
261 WdecNANDn(_WdecNANDn),
262 WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF.
263 l_ip = *configure_interface;
264 area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp,
265 g_tp.cell_h_def)
266 + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn,
267 g_tp.cell_h_def));
268
269
270 }
271
272
273 double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
274 double Ctotal = 0;
275
276 /* part 1: drain cap of NAND gate */
277 Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
278
279 /* part 2: gate cap of NAND gates */
280 Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
281
282 return Ctotal;
283 }
284
285
286 void DFFCell::compute_DFF_cell() {
287 double c1, c2, c3, c4, c5, c6;
288 /* node 5 and node 6 are identical to node 1 in capacitance */
289 c1 = c5 = c6 = fpfp_node_cap(2, 1);
290 c2 = fpfp_node_cap(2, 3);
291 c3 = fpfp_node_cap(3, 2);
292 c4 = fpfp_node_cap(2, 2);
293
294 //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
295 clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
296 e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) *
297 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
298
299 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
300 e_keep_1.readOp.dynamic +=
301 c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
302 e_keep_0.readOp.dynamic +=
303 c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
304 e_clock.readOp.dynamic +=
305 clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
306
307 /* static power */
308 e_switch.readOp.leakage +=
309 (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) *
310 5//5 NAND2 and 1 NAND3 in a DFF
311 + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
312 g_tp.peri_global.Vdd;
313 e_switch.readOp.gate_leakage +=
314 (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) *
315 5//5 NAND2 and 1 NAND3 in a DFF
316 + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
317 g_tp.peri_global.Vdd;
318 }
319
320 Pipeline::Pipeline(XMLNode* _xml_data,
321 const InputParameter *configure_interface,
322 const CoreParameters & dyn_p_,
323 enum Device_ty device_ty_,
324 bool _is_core_pipeline,
325 bool _is_default)
326 : McPATComponent(_xml_data), l_ip(*configure_interface),
327 coredynp(dyn_p_), device_ty(device_ty_),
328 is_core_pipeline(_is_core_pipeline), is_default(_is_default),
329 num_piperegs(0.0) {
330 name = "Pipeline?";
331
332 local_result = init_interface(&l_ip, name);
333 if (!coredynp.Embedded) {
334 process_ind = true;
335 } else {
336 process_ind = false;
337 }
338 //this was 20 micron for the 0.8 micron process
339 WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;
340 //this was 30 micron for the 0.8 micron process
341 WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ *
342 pmos_to_nmos_sz_ratio();
343 load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
344 compute();
345
346 }
347
348 void Pipeline::compute() {
349 compute_stage_vector();
350 DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
351 pipe_reg.compute_DFF_cell();
352
353 double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
354 //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
355 //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
356 double pipe_reg_power = num_piperegs *
357 (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic +
358 pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg;
359 double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
360 double pipe_reg_gate_leakage = num_piperegs *
361 pipe_reg.e_switch.readOp.gate_leakage;
362 power.readOp.dynamic += pipe_reg_power;
363 power.readOp.leakage += pipe_reg_leakage;
364 power.readOp.gate_leakage += pipe_reg_gate_leakage;
365 area.set_area(num_piperegs * pipe_reg.area.get_area());
366
367 double long_channel_device_reduction =
368 longer_channel_device_reduction(device_ty, coredynp.core_ty);
369 power.readOp.longer_channel_leakage = power.readOp.leakage *
370 long_channel_device_reduction;
371
372
373 double sckRation = g_tp.sckt_co_eff;
374 power.readOp.dynamic *= sckRation;
375 power.writeOp.dynamic *= sckRation;
376 power.searchOp.dynamic *= sckRation;
377 double macro_layout_overhead = g_tp.macro_layout_overhead;
378 if (!coredynp.Embedded)
379 area.set_area(area.get_area() * macro_layout_overhead);
380
381 output_data.area = area.get_area() / 1e6;
382 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
383 output_data.subthreshold_leakage_power = power.readOp.leakage;
384 output_data.gate_leakage_power = power.readOp.gate_leakage;
385 output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;
386 }
387
388 void Pipeline::compute_stage_vector() {
389 double num_stages, tot_stage_vector, per_stage_vector;
390 int opcode_length = coredynp.x86 ?
391 coredynp.micro_opcode_length : coredynp.opcode_width;
392
393 if (!is_core_pipeline) {
394 //The number of pipeline stages are calculated based on the achievable
395 //throughput and required throughput
396 num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector;
397 } else {
398 if (coredynp.core_ty == Inorder) {
399 /* assume 6 pipe stages and try to estimate bits per pipe stage */
400 /* pipe stage 0/IF */
401 num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
402 /* pipe stage IF/ID */
403 num_piperegs += coredynp.fetchW *
404 (coredynp.instruction_length + coredynp.pc_width) *
405 coredynp.num_hthreads;
406 /* pipe stage IF/ThreadSEL */
407 if (coredynp.multithreaded) {
408 num_piperegs += coredynp.num_hthreads *
409 coredynp.perThreadState; //8 bit thread states
410 }
411 /* pipe stage ID/EXE */
412 num_piperegs += coredynp.decodeW *
413 (coredynp.instruction_length + coredynp.pc_width +
414 pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
415 coredynp.num_hthreads;
416 /* pipe stage EXE/MEM */
417 num_piperegs += coredynp.issueW *
418 (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 *
419 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
420 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
421 num_piperegs += coredynp.issueW *
422 (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 *
423 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
424 num_stages = 6;
425 } else {
426 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
427 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
428
429 /* pipe stage 0/1F*/
430 num_piperegs +=
431 coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC
432 /* pipe stage IF/ID */
433 num_piperegs += coredynp.fetchW *
434 (coredynp.instruction_length + coredynp.pc_width) *
435 coredynp.num_hthreads;//PC is used to feed branch predictor in ID
436 /* pipe stage 1D/Renaming*/
437 num_piperegs += coredynp.decodeW *
438 (coredynp.instruction_length + coredynp.pc_width) *
439 coredynp.num_hthreads;//PC is for branch exe in later stage.
440 /* pipe stage Renaming/wire_drive */
441 num_piperegs += coredynp.decodeW *
442 (coredynp.instruction_length + coredynp.pc_width);
443 /* pipe stage Renaming/IssueQ */
444 //3*coredynp.phy_ireg_width means 2 sources and 1 dest
445 num_piperegs += coredynp.issueW *
446 (coredynp.instruction_length + coredynp.pc_width + 3 *
447 coredynp.phy_ireg_width) * coredynp.num_hthreads;
448 /* pipe stage IssueQ/Dispatch */
449 num_piperegs += coredynp.issueW *
450 (coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
451 /* pipe stage Dispatch/EXE */
452
453 num_piperegs += coredynp.issueW *
454 (3 * coredynp.phy_ireg_width + coredynp.pc_width +
455 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
456 /* 2^opcode_length means the total decoded signal for the opcode*/
457 num_piperegs += coredynp.issueW *
458 (2 * coredynp.int_data_width + pow(2.0, opcode_length)
459 /*+2*powers (2,reg_length)*/);
460 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
461 num_piperegs += coredynp.issueW *
462 (2 * coredynp.int_data_width + pow(2.0, opcode_length)
463 /*+2*powers (2,reg_length)*/);
464 /* pipe stage EXE/MEM, data need to be read/write, address*/
465 //memory Opcode still need to be passed
466 num_piperegs += coredynp.issueW *
467 (coredynp.int_data_width + coredynp.v_address_width +
468 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
469 /* pipe stage MEM/WB; result data, writeback regs */
470 num_piperegs += coredynp.issueW *
471 (coredynp.int_data_width + coredynp.phy_ireg_width
472 /* powers (2,opcode_length) +
473 (2,opcode_length)+2*powers (2,reg_length)*/);
474 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
475 num_piperegs += coredynp.commitW *
476 (coredynp.int_data_width + coredynp.v_address_width +
477 coredynp.phy_ireg_width
478 /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
479 coredynp.num_hthreads;
480 num_stages = 12;
481
482 }
483
484 /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
485 num_piperegs = num_piperegs * 1.5;
486 tot_stage_vector = num_piperegs;
487 per_stage_vector = tot_stage_vector / num_stages;
488
489 if (coredynp.core_ty == Inorder) {
490 if (coredynp.pipeline_stages > 6)
491 num_piperegs = per_stage_vector * coredynp.pipeline_stages;
492 } else { //OOO
493 if (coredynp.pipeline_stages > 12)
494 num_piperegs = per_stage_vector * coredynp.pipeline_stages;
495 }
496 }
497
498 }
499
500 FunctionalUnit::FunctionalUnit(XMLNode* _xml_data,
501 InputParameter* interface_ip_,
502 const CoreParameters & _core_params,
503 const CoreStatistics & _core_stats,
504 enum FU_type fu_type_)
505 : McPATComponent(_xml_data),
506 interface_ip(*interface_ip_), core_params(_core_params),
507 core_stats(_core_stats), fu_type(fu_type_) {
508 double area_t;
509 double leakage;
510 double gate_leakage;
511 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
512 clockRate = core_params.clockRate;
513
514 uca_org_t result2;
515 // Temp name for the following function call
516 name = "Functional Unit";
517
518 result2 = init_interface(&interface_ip, name);
519
520 if (core_params.Embedded) {
521 if (fu_type == FPU) {
522 num_fu=core_params.num_fpus;
523 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
524 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
525 //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
526 if (g_ip->F_sz_nm>90)
527 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
528 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
529 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
530 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
531 // base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
532 // base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
533 base_energy = 0;
534 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
535 //FPU power from Sandia's processor sizing tech report
536 FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
537 } else if (fu_type == ALU) {
538 num_fu=core_params.num_alus;
539 area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
540 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
541 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
542 // base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
543 // base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
544 base_energy = 0;
545 per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
546 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
547
548 } else if (fu_type == MUL) {
549 num_fu=core_params.num_muls;
550 area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
551 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
552 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
553 // base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
554 // base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
555 base_energy = 0;
556 per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
557 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
558 } else {
559 cout<<"Unknown Functional Unit Type"<<endl;
560 exit(0);
561 }
562 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
563 } else {
564 if (fu_type == FPU) {
565 name = "Floating Point Unit(s)";
566 num_fu = core_params.num_fpus;
567 area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
568 90.0);//this is um^2
569 if (g_ip->F_sz_nm > 90)
570 area_t = 8.47 * 1e6 *
571 g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
572 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
573 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
574 //W The base energy of ALU average numbers from Intel 4G and
575 //773Mhz (Wattch)
576 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3;
577 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
578 1.2);
579 per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
580 FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
581 } else if (fu_type == ALU) {
582 name = "Integer ALU(s)";
583 num_fu = core_params.num_alus;
584 //this is um^2 ALU + MUl
585 area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff;
586 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
587 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
588 //W The base energy of ALU average numbers from Intel 4G and 773Mhz
589 //(Wattch)
590 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3;
591 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
592 1.2);
593 per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
594 FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
595 } else if (fu_type == MUL) {
596 name = "Multiply/Divide Unit(s)";
597 num_fu = core_params.num_muls;
598 //this is um^2 ALU + MUl
599 area_t = 280 * 260 * 2 * 3 *
600 g_tp.scaling_factor.logic_scaling_co_eff;
601 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
602 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
603 //W The base energy of ALU average numbers from Intel 4G and 773Mhz
604 //(Wattch)
605 base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2;
606 base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
607 1.2);
608 per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
609 FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
610 } else {
611 cout << "Unknown Functional Unit Type" << endl;
612 exit(0);
613 }
614 }
615
616 area.set_area(area_t*num_fu);
617 power.readOp.leakage = leakage * num_fu;
618 power.readOp.gate_leakage = gate_leakage * num_fu;
619
620 double long_channel_device_reduction =
621 longer_channel_device_reduction(Core_device, core_params.core_ty);
622 power.readOp.longer_channel_leakage =
623 power.readOp.leakage * long_channel_device_reduction;
624 double macro_layout_overhead = g_tp.macro_layout_overhead;
625 area.set_area(area.get_area()*macro_layout_overhead);
626 }
627
628 void FunctionalUnit::computeEnergy() {
629 double pppm_t[4] = {1, 1, 1, 1};
630 double FU_duty_cycle;
631 double sckRation = g_tp.sckt_co_eff;
632
633 // TDP power calculation
634 //2 means two source operands needs to be passed for each int instruction.
635 set_pppm(pppm_t, 2, 2, 2, 2);
636 tdp_stats.readAc.access = num_fu;
637 if (fu_type == FPU) {
638 FU_duty_cycle = core_stats.FPU_duty_cycle;
639 } else if (fu_type == ALU) {
640 FU_duty_cycle = core_stats.ALU_duty_cycle;
641 } else if (fu_type == MUL) {
642 FU_duty_cycle = core_stats.MUL_duty_cycle;
643 }
644
645 power.readOp.dynamic =
646 per_access_energy * tdp_stats.readAc.access + base_energy / clockRate;
647 power.readOp.dynamic *= sckRation * FU_duty_cycle;
648
649 // Runtime power calculation
650 if (fu_type == FPU) {
651 rtp_stats.readAc.access = core_stats.fpu_accesses;
652 } else if (fu_type == ALU) {
653 rtp_stats.readAc.access = core_stats.ialu_accesses;
654 } else if (fu_type == MUL) {
655 rtp_stats.readAc.access = core_stats.mul_accesses;
656 }
657
658 rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access +
659 base_energy * execution_time;
660 rt_power.readOp.dynamic *= sckRation;
661
662 output_data.area = area.get_area() / 1e6;
663 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
664 output_data.subthreshold_leakage_power =
665 (longer_channel_device) ? power.readOp.longer_channel_leakage :
666 power.readOp.leakage;
667 output_data.gate_leakage_power = power.readOp.gate_leakage;
668 output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;
669 }
670
671 void FunctionalUnit::leakage_feedback(double temperature)
672 {
673 // Update the temperature and initialize the global interfaces.
674 interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
675
676 // init_result is dummy
677 uca_org_t init_result = init_interface(&interface_ip, name);
678
679 // This is part of FunctionalUnit()
680 double area_t, leakage, gate_leakage;
681 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
682
683 if (fu_type == FPU)
684 {
685 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
686 if (g_ip->F_sz_nm>90)
687 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
688 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
689 gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
690 }
691 else if (fu_type == ALU)
692 {
693 area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
694 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
695 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
696 }
697 else if (fu_type == MUL)
698 {
699 area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
700 leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
701 gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
702 }
703 else
704 {
705 cout<<"Unknown Functional Unit Type"<<endl;
706 exit(1);
707 }
708
709 power.readOp.leakage = leakage*num_fu;
710 power.readOp.gate_leakage = gate_leakage*num_fu;
711 power.readOp.longer_channel_leakage =
712 longer_channel_device_reduction(Core_device, core_params.core_ty);
713 }
714
715 UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_,
716 const CoreParameters & dyn_p_,
717 bool exist_)
718 : McPATComponent(_xml_data),
719 interface_ip(*interface_ip_), coredynp(dyn_p_),
720 core_ty(coredynp.core_ty), embedded(coredynp.Embedded),
721 pipeline_stage(coredynp.pipeline_stages),
722 num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
723 exist(exist_) {
724 if (!exist) return;
725
726 name = "Undifferentiated Core";
727 clockRate = coredynp.clockRate;
728
729 double undifferentiated_core = 0;
730 double core_tx_density = 0;
731 double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
732 double undifferentiated_core_coe;
733 uca_org_t result2;
734 result2 = init_interface(&interface_ip, name);
735
736 //Compute undifferentiated core area at 90nm.
737 if (embedded == false) {
738 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
739 if (core_ty == OOO) {
740 undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ?
741 (3.57 * log(pipeline_stage) - 1.2643) : 0;
742 } else if (core_ty == Inorder) {
743 undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ?
744 (-2.19 * log(pipeline_stage) + 6.55) : 0;
745 } else {
746 cout << "invalid core type" << endl;
747 exit(0);
748 }
749 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
750 } else {
751 //Based on the results in paper "parametrized processor models" Sandia Labs
752 if (opt_for_clk)
753 undifferentiated_core_coe = 0.05;
754 else
755 undifferentiated_core_coe = 0;
756 undifferentiated_core = (0.4109 * pipeline_stage - 0.776) *
757 undifferentiated_core_coe;
758 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
759 }
760
761 undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
762 1e6;//change from mm^2 to um^2
763 core_tx_density = g_tp.scaling_factor.core_tx_density;
764 power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
765 power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;
766
767 double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
768 power.readOp.longer_channel_leakage =
769 power.readOp.leakage * long_channel_device_reduction;
770 area.set_area(undifferentiated_core);
771
772 scktRatio = g_tp.sckt_co_eff;
773 power.readOp.dynamic *= scktRatio;
774 power.writeOp.dynamic *= scktRatio;
775 power.searchOp.dynamic *= scktRatio;
776 macro_PR_overhead = g_tp.macro_layout_overhead;
777 area.set_area(area.get_area()*macro_PR_overhead);
778
779 output_data.area = area.get_area() / 1e6;
780 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
781 output_data.subthreshold_leakage_power =
782 longer_channel_device ? power.readOp.longer_channel_leakage :
783 power.readOp.leakage;
784 output_data.gate_leakage_power = power.readOp.gate_leakage;
785 }
786
787 InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name,
788 bool _is_default,
789 const InputParameter *configure_interface,
790 int opcode_length_, int num_decoders_,
791 bool x86_,
792 double clockRate_,
793 enum Device_ty device_ty_,
794 enum Core_type core_ty_)
795 : McPATComponent(_xml_data), is_default(_is_default),
796 opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_),
797 device_ty(device_ty_), core_ty(core_ty_) {
798 /*
799 * Instruction decoder is different from n to 2^n decoders
800 * that are commonly used in row decoders in memory arrays.
801 * The RISC instruction decoder is typically a very simple device.
802 * We can decode an instruction by simply
803 * separating the machine word into small parts using wire slices
804 * The RISC instruction decoder can be approximate by the n to 2^n decoders,
805 * although this approximation usually underestimate power since each decoded
806 * instruction normally has more than 1 active signal.
807 *
808 * However, decoding a CISC instruction word is much more difficult
809 * than the RISC case. A CISC decoder is typically set up as a state machine.
810 * The machine reads the opcode field to determine
811 * what type of instruction it is,
812 * and where the other data values are.
813 * The instruction word is read in piece by piece,
814 * and decisions are made at each stage as to
815 * how the remainder of the instruction word will be read.
816 * (sequencer and ROM are usually needed)
817 * An x86 decoder can be even more complex since
818 * it involve both decoding instructions into u-ops and
819 * merge u-ops when doing micro-ops fusion.
820 */
821 name = _name;
822 clockRate = clockRate_;
823 bool is_dram = false;
824 double pmos_to_nmos_sizing_r;
825 double load_nmos_width, load_pmos_width;
826 double C_driver_load, R_wire_load;
827 Area cell;
828
829 l_ip = *configure_interface;
830 local_result = init_interface(&l_ip, name);
831 cell.h = g_tp.cell_h_def;
832 cell.w = g_tp.cell_h_def;
833
834 num_decoder_segments = (int)ceil(opcode_length / 18.0);
835 if (opcode_length > 18) opcode_length = 18;
836 num_decoded_signals = (int)pow(2.0, opcode_length);
837 pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
838 load_nmos_width = g_tp.max_w_nmos_ / 2;
839 load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
840 C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram);
841 R_wire_load = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
842
843 final_dec = new Decoder(
844 num_decoded_signals,
845 false,
846 C_driver_load,
847 R_wire_load,
848 false/*is_fa*/,
849 false/*is_dram*/,
850 false/*wl_tr*/, //to use peri device
851 cell);
852
853 PredecBlk * predec_blk1 = new PredecBlk(
854 num_decoded_signals,
855 final_dec,
856 0,//Assuming predec and dec are back to back
857 0,
858 1,//Each Predec only drives one final dec
859 false/*is_dram*/,
860 true);
861 PredecBlk * predec_blk2 = new PredecBlk(
862 num_decoded_signals,
863 final_dec,
864 0,//Assuming predec and dec are back to back
865 0,
866 1,//Each Predec only drives one final dec
867 false/*is_dram*/,
868 false);
869
870 PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
871 PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
872
873 pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2);
874
875 double area_decoder = final_dec->area.get_area() * num_decoded_signals *
876 num_decoder_segments * num_decoders;
877 //double w_decoder = area_decoder / area.get_h();
878 double area_pre_dec = (predec_blk_drv1->area.get_area() +
879 predec_blk_drv2->area.get_area() +
880 predec_blk1->area.get_area() +
881 predec_blk2->area.get_area()) *
882 num_decoder_segments * num_decoders;
883 area.set_area(area.get_area() + area_decoder + area_pre_dec);
884 double macro_layout_overhead = g_tp.macro_layout_overhead;
885 double chip_PR_overhead = g_tp.chip_layout_overhead;
886 area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);
887
888 inst_decoder_delay_power();
889
890 double sckRation = g_tp.sckt_co_eff;
891 power.readOp.dynamic *= sckRation;
892 power.writeOp.dynamic *= sckRation;
893 power.searchOp.dynamic *= sckRation;
894
895 double long_channel_device_reduction =
896 longer_channel_device_reduction(device_ty, core_ty);
897 power.readOp.longer_channel_leakage = power.readOp.leakage *
898 long_channel_device_reduction;
899
900 output_data.area = area.get_area() / 1e6;
901 output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
902 output_data.subthreshold_leakage_power = power.readOp.leakage;
903 output_data.gate_leakage_power = power.readOp.gate_leakage;
904 }
905
906 void InstructionDecoder::inst_decoder_delay_power() {
907
908 double dec_outrisetime;
909 double inrisetime = 0, outrisetime;
910 double pppm_t[4] = {1, 1, 1, 1};
911 double squencer_passes = x86 ? 2 : 1;
912
913 outrisetime = pre_dec->compute_delays(inrisetime);
914 dec_outrisetime = final_dec->compute_delays(outrisetime);
915 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
916 power = power + pre_dec->power * pppm_t;
917 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
918 num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
919 power = power + final_dec->power * pppm_t;
920 }
921
922 void InstructionDecoder::leakage_feedback(double temperature) {
923 l_ip.temp = (unsigned int)round(temperature/10.0)*10;
924 uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
925
926 final_dec->leakage_feedback(temperature);
927 pre_dec->leakage_feedback(temperature);
928
929 double pppm_t[4] = {1,1,1,1};
930 double squencer_passes = x86?2:1;
931
932 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
933 power = pre_dec->power*pppm_t;
934
935 set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
936 power = power + final_dec->power*pppm_t;
937
938 double sckRation = g_tp.sckt_co_eff;
939
940 power.readOp.dynamic *= sckRation;
941 power.writeOp.dynamic *= sckRation;
942 power.searchOp.dynamic *= sckRation;
943
944 double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
945 power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
946 }
947
948 InstructionDecoder::~InstructionDecoder() {
949 local_result.cleanup();
950
951 delete final_dec;
952
953 delete pre_dec->blk1;
954 delete pre_dec->blk2;
955 delete pre_dec->drv1;
956 delete pre_dec->drv2;
957 delete pre_dec;
958 }