1 /*****************************************************************************
3 * SOFTWARE LICENSE AGREEMENT
4 * Copyright 2012 Hewlett-Packard Development Company, L.P.
5 * Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are
10 * met: redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer;
12 * redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution;
15 * neither the name of the copyright holders nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ***************************************************************************/
// Constructor for selection_logic.
// Forwards _xml_data to McPATComponent and stores the default flag, the
// window size, the issue width and the device type. It then records the
// clock rate, copies *configure_interface into l_ip and calls
// init_interface() to fill local_result.
// NOTE(review): parts of this definition (opening/closing braces and any
// handling of _name, _accesses and core_ty_) are not visible in this view;
// confirm against the full file.
37 selection_logic::selection_logic(XMLNode
* _xml_data
, bool _is_default
,
38 int _win_entries
, int issue_width_
,
39 const InputParameter
*configure_interface
,
40 string _name
, double _accesses
,
41 double clockRate_
, enum Device_ty device_ty_
,
42 enum Core_type core_ty_
)
43 : McPATComponent(_xml_data
), is_default(_is_default
),
44 win_entries(_win_entries
),
45 issue_width(issue_width_
),
47 device_ty(device_ty_
),
49 clockRate
= clockRate_
;
51 l_ip
= *configure_interface
;
52 local_result
= init_interface(&l_ip
, name
);
// Publishes the area stored in local_result as this component's output area.
55 void selection_logic::computeArea() {
56 output_data
.area
= local_result
.area
;
// Computes the dynamic energy, subthreshold leakage and gate leakage of the
// selection (arbiter) logic and publishes them through output_data.
// Transistor widths are scaled from the listed 0.8um values by l_ip.F_sz_um;
// the capacitance of one arbiter (4-input OR + 4-bit priority encoder) is
// multiplied by issue_width * num_arbiter.
// NOTE(review): several lines of this definition (local initialisation,
// closing braces) are not visible in this view.
59 void selection_logic::computeEnergy() {
60 //based on cost effective superscalar processor TR pp27-31
61 double Ctotal
, Cor
, Cpencode
;
63 double WSelORn
, WSelORprequ
, WSelPn
, WSelPp
, WSelEnn
, WSelEnp
;
65 //the 0.8um process data is used.
66 //this was 10 micron for the 0.8 micron process
67 WSelORn
= 12.5 * l_ip
.F_sz_um
;
68 //this was 40 micron for the 0.8 micron process
69 WSelORprequ
= 50 * l_ip
.F_sz_um
;
70 //this was 10 micron for the 0.8 micron process
71 WSelPn
= 12.5 * l_ip
.F_sz_um
;
72 //this was 15 micron for the 0.8 micron process
73 WSelPp
= 18.75 * l_ip
.F_sz_um
;
74 //this was 5 micron for the 0.8 micron process
75 WSelEnn
= 6.25 * l_ip
.F_sz_um
;
76 //this was 10 micron for the 0.8 micron process
77 WSelEnp
= 12.5 * l_ip
.F_sz_um
;
// Each pass divides the remaining entries by 4 (rounded up) and adds that
// count to num_arbiter.
// NOTE(review): win_entries is the value set in the constructor's
// initializer list and is overwritten here, so the original window size is
// no longer available after this loop; confirm this is intended.
81 while (win_entries
> 4) {
82 win_entries
= (int)ceil((double)win_entries
/ 4.0);
83 num_arbiter
+= win_entries
;
85 //the 4-input OR logic to generate anyreq
86 Cor
= 4 * drain_C_(WSelORn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
87 drain_C_(WSelORprequ
, PCH
, 1, 1, g_tp
.cell_h_def
);
// NOTE(review): this gate_leakage value is replaced by the plain assignment
// to power.readOp.gate_leakage further down in this function; confirm
// whether the OR-gate term was meant to be accumulated instead.
88 power
.readOp
.gate_leakage
=
89 cmos_Ig_leakage(WSelORn
, WSelORprequ
, 4, nor
) * g_tp
.peri_global
.Vdd
;
91 //The total capacity of the 4-bit priority encoder
92 Cpencode
= drain_C_(WSelPn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
93 drain_C_(WSelPp
, PCH
, 1, 1, g_tp
.cell_h_def
) +
94 2 * drain_C_(WSelPn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
95 drain_C_(WSelPp
, PCH
, 2, 1, g_tp
.cell_h_def
) +
96 3 * drain_C_(WSelPn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
97 drain_C_(WSelPp
, PCH
, 3, 1, g_tp
.cell_h_def
) +
98 4 * drain_C_(WSelPn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
99 drain_C_(WSelPp
, PCH
, 4, 1, g_tp
.cell_h_def
) +//precompute priority logic
100 2 * 4 * gate_C(WSelEnn
+ WSelEnp
, 20.0) +
101 4 * drain_C_(WSelEnn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
102 2 * 4 * drain_C_(WSelEnp
, PCH
, 1, 1, g_tp
.cell_h_def
) +//enable logic
103 (2 * 4 + 2 * 3 + 2 * 2 + 2) *
104 gate_C(WSelPn
+ WSelPp
, 10.0);//requests signal
// NOTE(review): Ctotal is accumulated with += here; its initialisation (and
// that of num_arbiter) is not visible in this view.
106 Ctotal
+= issue_width
* num_arbiter
* (Cor
+ Cpencode
);
108 //2 means the arbitration signal needs to travel round trip
109 power
.readOp
.dynamic
=
110 Ctotal
* g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
* 2;
111 power
.readOp
.leakage
= issue_width
* num_arbiter
*
112 (cmos_Isub_leakage(WSelPn
, WSelPp
, 2, nor
)/*approximate precompute with a nor gate*///grant1p
113 + cmos_Isub_leakage(WSelPn
, WSelPp
, 3, nor
)//grant2p
114 + cmos_Isub_leakage(WSelPn
, WSelPp
, 4, nor
)//grant3p
115 + cmos_Isub_leakage(WSelEnn
, WSelEnp
, 2, nor
)*4//enable logic
116 + cmos_Isub_leakage(WSelEnn
, WSelEnp
, 1, inv
)*2*3//for each grant there are two inverters, there are 3 grant signals
117 ) * g_tp
.peri_global
.Vdd
;
118 power
.readOp
.gate_leakage
= issue_width
* num_arbiter
*
119 (cmos_Ig_leakage(WSelPn
, WSelPp
, 2, nor
)/*approximate precompute with a nor gate*///grant1p
120 + cmos_Ig_leakage(WSelPn
, WSelPp
, 3, nor
)//grant2p
121 + cmos_Ig_leakage(WSelPn
, WSelPp
, 4, nor
)//grant3p
122 + cmos_Ig_leakage(WSelEnn
, WSelEnp
, 2, nor
)*4//enable logic
123 + cmos_Ig_leakage(WSelEnn
, WSelEnp
, 1, inv
)*2*3//for each grant there are two inverters, there are 3 grant signals
124 ) * g_tp
.peri_global
.Vdd
;
// Scale all dynamic energies by the coefficient g_tp.sckt_co_eff.
125 double sckRation
= g_tp
.sckt_co_eff
;
126 power
.readOp
.dynamic
*= sckRation
;
127 power
.writeOp
.dynamic
*= sckRation
;
128 power
.searchOp
.dynamic
*= sckRation
;
130 double long_channel_device_reduction
=
131 longer_channel_device_reduction(device_ty
, core_ty
);
132 power
.readOp
.longer_channel_leakage
=
133 power
.readOp
.leakage
* long_channel_device_reduction
;
// Publish the results: peak power uses clockRate, runtime energy uses accesses.
135 output_data
.peak_dynamic_power
= power
.readOp
.dynamic
* clockRate
;
136 output_data
.subthreshold_leakage_power
= power
.readOp
.leakage
;
137 output_data
.gate_leakage_power
= power
.readOp
.gate_leakage
;
138 output_data
.runtime_dynamic_energy
= power
.readOp
.dynamic
* accesses
;
// Constructor for dep_resource_conflict_check.
// Stores the interface/core parameters and comparator width, derives the
// comparator transistor widths from l_ip.F_sz_um, initialises the interface,
// widens compare_bits, runs conflict_check_power() and finally scales the
// dynamic energies by g_tp.sckt_co_eff.
// NOTE(review): some lines of this definition are not visible in this view
// (e.g. the handling of _name and the closing brace).
141 dep_resource_conflict_check::dep_resource_conflict_check(
142 XMLNode
* _xml_data
, const string _name
,
143 const InputParameter
*configure_interface
,
144 const CoreParameters
& dyn_p_
, int compare_bits_
,
145 double clockRate_
, bool _is_default
)
146 : McPATComponent(_xml_data
), l_ip(*configure_interface
),
147 coredynp(dyn_p_
), compare_bits(compare_bits_
), is_default(_is_default
) {
150 clockRate
= clockRate_
;
151 //this was 20.0 micron for the 0.8 micron process
152 Wcompn
= 25 * l_ip
.F_sz_um
;
153 //this was 20.0 micron for the 0.8 micron process
154 Wevalinvp
= 25 * l_ip
.F_sz_um
;
155 //this was 80.0 micron for the 0.8 micron process
156 Wevalinvn
= 100 * l_ip
.F_sz_um
;
157 //this was 40.0 micron for the 0.8 micron process
158 Wcomppreequ
= 50 * l_ip
.F_sz_um
;
159 //this was 5.4 micron for the 0.8 micron process
160 WNORn
= 6.75 * l_ip
.F_sz_um
;
161 //this was 30.5 micron for the 0.8 micron process
162 WNORp
= 38.125 * l_ip
.F_sz_um
;
164 // To make CACTI happy.
165 l_ip
.cache_sz
= MIN_BUFFER_SIZE
;
166 local_result
= init_interface(&l_ip
, name
);
// NOTE(review): both visible updates below add the same 16 + 8 + 8 to
// compare_bits; presumably an `else` (not visible here) separates them, in
// which case the core-type test has no effect. Confirm.
168 if (coredynp
.core_ty
== Inorder
)
169 //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
171 compare_bits
+= 16 + 8 + 8;
173 compare_bits
+= 16 + 8 + 8;
175 conflict_check_power();
176 double sckRation
= g_tp
.sckt_co_eff
;
177 power
.readOp
.dynamic
*= sckRation
;
178 power
.writeOp
.dynamic
*= sckRation
;
179 power
.searchOp
.dynamic
*= sckRation
;
// Fills power.readOp (dynamic, leakage, longer_channel_leakage, gate_leakage)
// for the dependency-check comparators. The comparator count is derived from
// coredynp.decodeW and the per-comparator capacitance comes from compare_cap().
// NOTE(review): the tail of the num_comparators expression and the local
// declarations are not visible in this view.
183 void dep_resource_conflict_check::conflict_check_power() {
186 //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
187 //dest to dest comparison.
188 num_comparators
= 3 * ((coredynp
.decodeW
) * (coredynp
.decodeW
) -
191 Ctotal
= num_comparators
* compare_cap();
// Dynamic energy is Ctotal * Vdd * Vdd; the commented-out CLOCKRATE and AF
// markers show factors that are not applied here.
193 power
.readOp
.dynamic
= Ctotal
* /*CLOCKRATE*/ g_tp
.peri_global
.Vdd
*
194 g_tp
.peri_global
.Vdd
/*AF*/;
195 power
.readOp
.leakage
= num_comparators
* compare_bits
* 2 *
196 simplified_nmos_leakage(Wcompn
, false);
198 double long_channel_device_reduction
=
199 longer_channel_device_reduction(Core_device
, coredynp
.core_ty
);
200 power
.readOp
.longer_channel_leakage
=
201 power
.readOp
.leakage
* long_channel_device_reduction
;
202 power
.readOp
.gate_leakage
= num_comparators
* compare_bits
* 2 *
203 cmos_Ig_leakage(Wcompn
, 0, 2, nmos
);
207 /* estimate comparator power consumption (this comparator is similar
208 to the tag-match structure in a CAM */
// Computes two capacitance terms, c2 (bottom part) and c1 (top part), from
// compare_bits and the comparator/NOR transistor widths.
// NOTE(review): the local declarations and the return statement are not
// visible in this view, so how c1 and c2 are combined cannot be confirmed.
209 double dep_resource_conflict_check::compare_cap() {
212 //resize the big NOR gate at the DCL according to fan in.
// NOTE(review): WNORp is not declared in this function (it is assigned in
// the constructor), so this rescaling persists; calling compare_cap() more
// than once would compound it. Confirm the function is called only once.
213 WNORp
= WNORp
* compare_bits
/ 2.0;
214 /* bottom part of comparator */
215 c2
= (compare_bits
) * (drain_C_(Wcompn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
216 drain_C_(Wcompn
, NCH
, 2, 1, g_tp
.cell_h_def
)) +
217 drain_C_(Wevalinvp
, PCH
, 1, 1, g_tp
.cell_h_def
) +
218 drain_C_(Wevalinvn
, NCH
, 1, 1, g_tp
.cell_h_def
);
220 /* top part of comparator */
221 c1
= (compare_bits
) * (drain_C_(Wcompn
, NCH
, 1, 1, g_tp
.cell_h_def
) +
222 drain_C_(Wcompn
, NCH
, 2, 1, g_tp
.cell_h_def
) +
223 drain_C_(Wcomppreequ
, NCH
, 1, 1, g_tp
.cell_h_def
)) +
224 gate_C(WNORn
+ WNORp
, 10.0) +
225 drain_C_(WNORp
, NCH
, 2, 1, g_tp
.cell_h_def
) + compare_bits
*
226 drain_C_(WNORn
, NCH
, 2, 1, g_tp
.cell_h_def
);
// Recomputes the leakage figures in power.readOp for a new temperature.
// The temperature is rounded to the nearest multiple of 10 and stored in
// l_ip.temp, init_interface() is re-run, and the leakage part of
// conflict_check_power() is repeated.
// NOTE(review): the braces and the tail of the num_comparators expression
// are not visible in this view.
231 void dep_resource_conflict_check::leakage_feedback(double temperature
)
233 l_ip
.temp
= (unsigned int)round(temperature
/10.0)*10;
234 uca_org_t init_result
= init_interface(&l_ip
, name
); // init_result is dummy
236 // This is part of conflict_check_power()
237 // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
238 // to dest comparison.
239 int num_comparators
= 3 * ((coredynp
.decodeW
) * (coredynp
.decodeW
) -
241 power
.readOp
.leakage
= num_comparators
* compare_bits
* 2 *
242 simplified_nmos_leakage(Wcompn
, false);
244 double long_channel_device_reduction
=
245 longer_channel_device_reduction(Core_device
, coredynp
.core_ty
);
246 power
.readOp
.longer_channel_leakage
= power
.readOp
.leakage
*
247 long_channel_device_reduction
;
248 power
.readOp
.gate_leakage
= num_comparators
* compare_bits
* 2 *
249 cmos_Ig_leakage(Wcompn
, 0, 2, nmos
);
258 const InputParameter
*configure_interface
)
260 cell_load(_cell_load
),
261 WdecNANDn(_WdecNANDn
),
262 WdecNANDp(_WdecNANDp
) { //this model is based on the NAND2 based DFF.
263 l_ip
= *configure_interface
;
264 area
.set_area(5 * compute_gate_area(NAND
, 2,WdecNANDn
,WdecNANDp
,
266 + compute_gate_area(NAND
, 2,WdecNANDn
,WdecNANDn
,
// Accumulates the capacitance of one flip-flop node into Ctotal: the NAND
// drain capacitance (the PMOS term is scaled by fan_in) plus the gate
// capacitance of fan_out NAND gates.
// NOTE(review): the declaration of Ctotal and the return statement are not
// visible in this view.
273 double DFFCell::fpfp_node_cap(unsigned int fan_in
, unsigned int fan_out
) {
276 /* part 1: drain cap of NAND gate */
277 Ctotal
+= drain_C_(WdecNANDn
, NCH
, 2, 1, g_tp
.cell_h_def
, is_dram
) + fan_in
* drain_C_(WdecNANDp
, PCH
, 1, 1, g_tp
.cell_h_def
, is_dram
);
279 /* part 2: gate cap of NAND gates */
280 Ctotal
+= fan_out
* gate_C(WdecNANDn
+ WdecNANDp
, 0, is_dram
);
// Computes the per-cell energies of the flip-flop: e_switch, e_keep_1,
// e_keep_0 and e_clock dynamic energy, plus e_switch subthreshold and gate
// leakage. Node capacitances come from fpfp_node_cap() with different
// fan-in/fan-out pairs. All results are accumulated with +=.
286 void DFFCell::compute_DFF_cell() {
287 double c1
, c2
, c3
, c4
, c5
, c6
;
288 /* node 5 and node 6 are identical to node 1 in capacitance */
289 c1
= c5
= c6
= fpfp_node_cap(2, 1);
290 c2
= fpfp_node_cap(2, 3);
291 c3
= fpfp_node_cap(3, 2);
292 c4
= fpfp_node_cap(2, 2);
294 //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
295 clock_cap
= 2 * gate_C(WdecNANDn
+ WdecNANDp
, 0, is_dram
);
// Switching energy: all six node capacitances plus twice the external cell
// load, times 0.5 * Vdd * Vdd.
296 e_switch
.readOp
.dynamic
+= (c4
+ c1
+ c2
+ c3
+ c5
+ c6
+ 2 * cell_load
) *
297 0.5 * g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
;;
299 /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
300 e_keep_1
.readOp
.dynamic
+=
301 c3
* g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
;
302 e_keep_0
.readOp
.dynamic
+=
303 c2
* g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
;
304 e_clock
.readOp
.dynamic
+=
305 clock_cap
* g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
;;
// NOTE(review): the NAND3 terms below pass WdecNANDn for both width
// arguments, whereas the NAND2 terms pass WdecNANDn and WdecNANDp; confirm
// whether the second argument was meant to be WdecNANDp.
308 e_switch
.readOp
.leakage
+=
309 (cmos_Isub_leakage(WdecNANDn
, WdecNANDp
, 2, nand
) *
310 5//5 NAND2 and 1 NAND3 in a DFF
311 + cmos_Isub_leakage(WdecNANDn
, WdecNANDn
, 3, nand
)) *
312 g_tp
.peri_global
.Vdd
;
313 e_switch
.readOp
.gate_leakage
+=
314 (cmos_Ig_leakage(WdecNANDn
, WdecNANDp
, 2, nand
) *
315 5//5 NAND2 and 1 NAND3 in a DFF
316 + cmos_Ig_leakage(WdecNANDn
, WdecNANDn
, 3, nand
)) *
317 g_tp
.peri_global
.Vdd
;
// Constructor for Pipeline.
// Stores the interface and core parameters, the device type and the
// core-pipeline/default flags, initialises the interface, then sizes the
// NAND transistors (WNANDn, WNANDp) and derives load_per_pipeline_stage from
// their gate capacitance.
// NOTE(review): several lines of this definition are not visible in this
// view (the last constructor parameter, the body of the
// `!coredynp.Embedded` branch, and the closing braces).
320 Pipeline::Pipeline(XMLNode
* _xml_data
,
321 const InputParameter
*configure_interface
,
322 const CoreParameters
& dyn_p_
,
323 enum Device_ty device_ty_
,
324 bool _is_core_pipeline
,
326 : McPATComponent(_xml_data
), l_ip(*configure_interface
),
327 coredynp(dyn_p_
), device_ty(device_ty_
),
328 is_core_pipeline(_is_core_pipeline
), is_default(_is_default
),
332 local_result
= init_interface(&l_ip
, name
);
333 if (!coredynp
.Embedded
) {
// When process_ind is set the widths are scaled from the 0.8um values by
// l_ip.F_sz_um; otherwise the minimum NMOS width (and its PMOS equivalent)
// is used.
338 //this was 20 micron for the 0.8 micron process
339 WNANDn
= (process_ind
) ? 25 * l_ip
.F_sz_um
: g_tp
.min_w_nmos_
;
340 //this was 30 micron for the 0.8 micron process
341 WNANDp
= (process_ind
) ? 37.5 * l_ip
.F_sz_um
: g_tp
.min_w_nmos_
*
342 pmos_to_nmos_sz_ratio();
343 load_per_pipeline_stage
= 2 * gate_C(WNANDn
+ WNANDp
, 0, false);
// Computes the pipeline-register power and area and publishes them through
// output_data. The register count comes from compute_stage_vector(); a single
// DFFCell supplies the per-register energies, which are multiplied by
// num_piperegs.
348 void Pipeline::compute() {
349 compute_stage_vector();
350 DFFCell
pipe_reg(false, WNANDn
, WNANDp
, load_per_pipeline_stage
, &l_ip
);
351 pipe_reg
.compute_DFF_cell();
353 double clock_power_pipereg
= num_piperegs
* pipe_reg
.e_clock
.readOp
.dynamic
;
354 //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
355 //the Hamming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
// Average of the switch / keep-0 / keep-1 energies, plus the clock energy.
356 double pipe_reg_power
= num_piperegs
*
357 (pipe_reg
.e_switch
.readOp
.dynamic
+ pipe_reg
.e_keep_0
.readOp
.dynamic
+
358 pipe_reg
.e_keep_1
.readOp
.dynamic
) / 3 + clock_power_pipereg
;
359 double pipe_reg_leakage
= num_piperegs
* pipe_reg
.e_switch
.readOp
.leakage
;
360 double pipe_reg_gate_leakage
= num_piperegs
*
361 pipe_reg
.e_switch
.readOp
.gate_leakage
;
362 power
.readOp
.dynamic
+= pipe_reg_power
;
363 power
.readOp
.leakage
+= pipe_reg_leakage
;
364 power
.readOp
.gate_leakage
+= pipe_reg_gate_leakage
;
365 area
.set_area(num_piperegs
* pipe_reg
.area
.get_area());
367 double long_channel_device_reduction
=
368 longer_channel_device_reduction(device_ty
, coredynp
.core_ty
);
369 power
.readOp
.longer_channel_leakage
= power
.readOp
.leakage
*
370 long_channel_device_reduction
;
373 double sckRation
= g_tp
.sckt_co_eff
;
374 power
.readOp
.dynamic
*= sckRation
;
375 power
.writeOp
.dynamic
*= sckRation
;
376 power
.searchOp
.dynamic
*= sckRation
;
// The macro layout overhead is applied to the area only when the core is
// not embedded.
377 double macro_layout_overhead
= g_tp
.macro_layout_overhead
;
378 if (!coredynp
.Embedded
)
379 area
.set_area(area
.get_area() * macro_layout_overhead
);
381 output_data
.area
= area
.get_area() / 1e6
;
382 output_data
.peak_dynamic_power
= power
.readOp
.dynamic
* clockRate
;
383 output_data
.subthreshold_leakage_power
= power
.readOp
.leakage
;
384 output_data
.gate_leakage_power
= power
.readOp
.gate_leakage
;
385 output_data
.runtime_dynamic_energy
= power
.readOp
.dynamic
* total_cycles
;
// Estimates num_piperegs, the number of pipeline register bits.
// For a non-core pipeline it is l_ip.pipeline_stages * l_ip.per_stage_vector.
// Otherwise the bits of each stage boundary are summed from the core
// parameters (one list for in-order cores, a longer one for the other case),
// increased by 50%, and rescaled when coredynp.pipeline_stages exceeds the
// assumed depth.
// NOTE(review): many lines of this definition are not visible in this view
// (else branches, closing braces and the assignment of num_stages), so the
// exact branch structure cannot be confirmed here.
388 void Pipeline::compute_stage_vector() {
389 double num_stages
, tot_stage_vector
, per_stage_vector
;
390 int opcode_length
= coredynp
.x86
?
391 coredynp
.micro_opcode_length
: coredynp
.opcode_width
;
393 if (!is_core_pipeline
) {
394 //The number of pipeline stages are calculated based on the achievable
395 //throughput and required throughput
396 num_piperegs
= l_ip
.pipeline_stages
* l_ip
.per_stage_vector
;
398 if (coredynp
.core_ty
== Inorder
) {
399 /* assume 6 pipe stages and try to estimate bits per pipe stage */
400 /* pipe stage 0/IF */
401 num_piperegs
+= coredynp
.pc_width
* 2 * coredynp
.num_hthreads
;
402 /* pipe stage IF/ID */
403 num_piperegs
+= coredynp
.fetchW
*
404 (coredynp
.instruction_length
+ coredynp
.pc_width
) *
405 coredynp
.num_hthreads
;
406 /* pipe stage IF/ThreadSEL */
407 if (coredynp
.multithreaded
) {
408 num_piperegs
+= coredynp
.num_hthreads
*
409 coredynp
.perThreadState
; //8 bit thread states
411 /* pipe stage ID/EXE */
412 num_piperegs
+= coredynp
.decodeW
*
413 (coredynp
.instruction_length
+ coredynp
.pc_width
+
414 pow(2.0, opcode_length
) + 2 * coredynp
.int_data_width
) *
415 coredynp
.num_hthreads
;
416 /* pipe stage EXE/MEM */
417 num_piperegs
+= coredynp
.issueW
*
418 (3 * coredynp
.arch_ireg_width
+ pow(2.0, opcode_length
) + 8 *
419 2 * coredynp
.int_data_width
/*+2*powers (2,reg_length)*/);
420 /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
421 num_piperegs
+= coredynp
.issueW
*
422 (2 * coredynp
.int_data_width
+ pow(2.0, opcode_length
) + 8 *
423 2 * coredynp
.int_data_width
/*+2*powers (2,reg_length)*/);
426 /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
427 /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
431 coredynp
.pc_width
* 2 * coredynp
.num_hthreads
;//PC and Next PC
432 /* pipe stage IF/ID */
433 num_piperegs
+= coredynp
.fetchW
*
434 (coredynp
.instruction_length
+ coredynp
.pc_width
) *
435 coredynp
.num_hthreads
;//PC is used to feed branch predictor in ID
436 /* pipe stage ID/Renaming*/
437 num_piperegs
+= coredynp
.decodeW
*
438 (coredynp
.instruction_length
+ coredynp
.pc_width
) *
439 coredynp
.num_hthreads
;//PC is for branch exe in later stage.
440 /* pipe stage Renaming/wire_drive */
441 num_piperegs
+= coredynp
.decodeW
*
442 (coredynp
.instruction_length
+ coredynp
.pc_width
);
443 /* pipe stage Renaming/IssueQ */
444 //3*coredynp.phy_ireg_width means 2 sources and 1 dest
445 num_piperegs
+= coredynp
.issueW
*
446 (coredynp
.instruction_length
+ coredynp
.pc_width
+ 3 *
447 coredynp
.phy_ireg_width
) * coredynp
.num_hthreads
;
448 /* pipe stage IssueQ/Dispatch */
449 num_piperegs
+= coredynp
.issueW
*
450 (coredynp
.instruction_length
+ 3 * coredynp
.phy_ireg_width
);
451 /* pipe stage Dispatch/EXE */
453 num_piperegs
+= coredynp
.issueW
*
454 (3 * coredynp
.phy_ireg_width
+ coredynp
.pc_width
+
455 pow(2.0, opcode_length
)/*+2*powers (2,reg_length)*/);
456 /* 2^opcode_length means the total decoded signal for the opcode*/
457 num_piperegs
+= coredynp
.issueW
*
458 (2 * coredynp
.int_data_width
+ pow(2.0, opcode_length
)
459 /*+2*powers (2,reg_length)*/);
460 /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
461 num_piperegs
+= coredynp
.issueW
*
462 (2 * coredynp
.int_data_width
+ pow(2.0, opcode_length
)
463 /*+2*powers (2,reg_length)*/);
464 /* pipe stage EXE/MEM, data need to be read/write, address*/
465 //memory Opcode still need to be passed
466 num_piperegs
+= coredynp
.issueW
*
467 (coredynp
.int_data_width
+ coredynp
.v_address_width
+
468 pow(2.0, opcode_length
)/*+2*powers (2,reg_length)*/);
469 /* pipe stage MEM/WB; result data, writeback regs */
470 num_piperegs
+= coredynp
.issueW
*
471 (coredynp
.int_data_width
+ coredynp
.phy_ireg_width
472 /* powers (2,opcode_length) +
473 (2,opcode_length)+2*powers (2,reg_length)*/);
474 /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
475 num_piperegs
+= coredynp
.commitW
*
476 (coredynp
.int_data_width
+ coredynp
.v_address_width
+
477 coredynp
.phy_ireg_width
478 /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
479 coredynp
.num_hthreads
;
484 /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
485 num_piperegs
= num_piperegs
* 1.5;
486 tot_stage_vector
= num_piperegs
;
// NOTE(review): the assignment of num_stages is not visible in this view;
// confirm it is set on every path before this division.
487 per_stage_vector
= tot_stage_vector
/ num_stages
;
// Deeper pipelines than the assumed depth (6 or 12 stages) are scaled by
// the average bits per stage.
489 if (coredynp
.core_ty
== Inorder
) {
490 if (coredynp
.pipeline_stages
> 6)
491 num_piperegs
= per_stage_vector
* coredynp
.pipeline_stages
;
493 if (coredynp
.pipeline_stages
> 12)
494 num_piperegs
= per_stage_vector
* coredynp
.pipeline_stages
;
// Constructor for FunctionalUnit.
// Copies the interface parameters, core parameters/statistics and the unit
// type, initialises the interface, then selects per-unit area (area_t),
// leakage, gate leakage, per-access energy and FU_height according to
// fu_type (FPU / ALU / MUL). One set of constants is used when
// core_params.Embedded is set; a second FPU/ALU/MUL chain follows, which
// also sets name and base_energy (presumably the non-embedded case).
// Finally area and leakage are multiplied by num_fu, and the area by the
// macro layout overhead.
// NOTE(review): several lines of this definition are not visible in this
// view (local declarations, else branches, closing braces and the tails of
// some expressions).
500 FunctionalUnit::FunctionalUnit(XMLNode
* _xml_data
,
501 InputParameter
* interface_ip_
,
502 const CoreParameters
& _core_params
,
503 const CoreStatistics
& _core_stats
,
504 enum FU_type fu_type_
)
505 : McPATComponent(_xml_data
),
506 interface_ip(*interface_ip_
), core_params(_core_params
),
507 core_stats(_core_stats
), fu_type(fu_type_
) {
511 double pmos_to_nmos_sizing_r
= pmos_to_nmos_sz_ratio();
512 clockRate
= core_params
.clockRate
;
515 // Temp name for the following function call
516 name
= "Functional Unit";
518 result2
= init_interface(&interface_ip
, name
);
520 if (core_params
.Embedded
) {
521 if (fu_type
== FPU
) {
522 num_fu
=core_params
.num_fpus
;
523 //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
524 area_t
= 4.47*1e6
*(g_ip
->F_sz_nm
*g_ip
->F_sz_nm
/90.0/90.0);//this is um^2 The base number
525 //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
526 if (g_ip
->F_sz_nm
>90)
527 area_t
= 4.47*1e6
*g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2
528 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
529 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
530 //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
531 // base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
532 // base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
534 per_access_energy
= 1.15/1e9
/4/1.3/1.3*g_tp
.peri_global
.Vdd
*g_tp
.peri_global
.Vdd
*(g_ip
->F_sz_nm
/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
535 //FPU power from Sandia's processor sizing tech report
536 FU_height
=(18667*num_fu
)*interface_ip
.F_sz_um
;//FPU from Sun's data
537 } else if (fu_type
== ALU
) {
538 num_fu
=core_params
.num_alus
;
539 area_t
= 280*260*g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2 ALU + MUl
540 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
541 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;
542 // base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
543 // base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
545 per_access_energy
= 1.15/3/1e9
/4/1.3/1.3*g_tp
.peri_global
.Vdd
*g_tp
.peri_global
.Vdd
*(g_ip
->F_sz_nm
/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
546 FU_height
=(6222*num_fu
)*interface_ip
.F_sz_um
;//integer ALU
548 } else if (fu_type
== MUL
) {
549 num_fu
=core_params
.num_muls
;
550 area_t
= 280*260*3*g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2 ALU + MUl
551 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
552 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;
553 // base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
554 // base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
556 per_access_energy
= 1.15*2/3/1e9
/4/1.3/1.3*g_tp
.peri_global
.Vdd
*g_tp
.peri_global
.Vdd
*(g_ip
->F_sz_nm
/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
557 FU_height
=(9334*num_fu
)*interface_ip
.F_sz_um
;//divider/mul from Sun's data
559 cout
<<"Unknown Functional Unit Type"<<endl
;
562 per_access_energy
*=0.5;//According to ARM data embedded processor has much lower per acc energy
// Second FPU/ALU/MUL chain: also sets the display name and base_energy.
564 if (fu_type
== FPU
) {
565 name
= "Floating Point Unit(s)";
566 num_fu
= core_params
.num_fpus
;
567 area_t
= 8.47 * 1e6
* (g_ip
->F_sz_nm
* g_ip
->F_sz_nm
/ 90.0 /
569 if (g_ip
->F_sz_nm
> 90)
570 area_t
= 8.47 * 1e6
*
571 g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2
572 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
573 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
574 //W The base energy of ALU average numbers from Intel 4G and
576 base_energy
= core_params
.core_ty
== Inorder
? 0 : 89e-3 * 3;
577 base_energy
*= (g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
/ 1.2 /
579 per_access_energy
= 1.15*3/1e9
/4/1.3/1.3*g_tp
.peri_global
.Vdd
*g_tp
.peri_global
.Vdd
*(g_ip
->F_sz_nm
/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
580 FU_height
=(38667*num_fu
)*interface_ip
.F_sz_um
;//FPU from Sun's data
581 } else if (fu_type
== ALU
) {
582 name
= "Integer ALU(s)";
583 num_fu
= core_params
.num_alus
;
584 //this is um^2 ALU + MUl
585 area_t
= 280 * 260 * 2 * g_tp
.scaling_factor
.logic_scaling_co_eff
;
586 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
587 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;
588 //W The base energy of ALU average numbers from Intel 4G and 773Mhz
590 base_energy
= core_params
.core_ty
== Inorder
? 0 : 89e-3;
591 base_energy
*= (g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
/ 1.2 /
593 per_access_energy
= 1.15/1e9
/4/1.3/1.3*g_tp
.peri_global
.Vdd
*g_tp
.peri_global
.Vdd
*(g_ip
->F_sz_nm
/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
594 FU_height
=(6222*num_fu
)*interface_ip
.F_sz_um
;//integer ALU
595 } else if (fu_type
== MUL
) {
596 name
= "Multiply/Divide Unit(s)";
597 num_fu
= core_params
.num_muls
;
598 //this is um^2 ALU + MUl
599 area_t
= 280 * 260 * 2 * 3 *
600 g_tp
.scaling_factor
.logic_scaling_co_eff
;
601 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
602 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;
603 //W The base energy of ALU average numbers from Intel 4G and 773Mhz
605 base_energy
= core_params
.core_ty
== Inorder
? 0 : 89e-3 * 2;
606 base_energy
*= (g_tp
.peri_global
.Vdd
* g_tp
.peri_global
.Vdd
/ 1.2 /
608 per_access_energy
= 1.15*2/1e9
/4/1.3/1.3*g_tp
.peri_global
.Vdd
*g_tp
.peri_global
.Vdd
*(g_ip
->F_sz_nm
/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
609 FU_height
=(9334*num_fu
)*interface_ip
.F_sz_um
;//divider/mul from Sun's data
611 cout
<< "Unknown Functional Unit Type" << endl
;
// Scale the per-unit figures by the number of units.
616 area
.set_area(area_t
*num_fu
);
617 power
.readOp
.leakage
= leakage
* num_fu
;
618 power
.readOp
.gate_leakage
= gate_leakage
* num_fu
;
620 double long_channel_device_reduction
=
621 longer_channel_device_reduction(Core_device
, core_params
.core_ty
);
622 power
.readOp
.longer_channel_leakage
=
623 power
.readOp
.leakage
* long_channel_device_reduction
;
624 double macro_layout_overhead
= g_tp
.macro_layout_overhead
;
625 area
.set_area(area
.get_area()*macro_layout_overhead
);
// Computes the TDP (peak) and runtime dynamic energy of the functional
// unit(s) and publishes area, power and energy through output_data.
// Peak: num_fu accesses plus base_energy / clockRate, scaled by the duty
// cycle for this fu_type. Runtime: the access count for this fu_type from
// core_stats plus base_energy * execution_time.
// NOTE(review): the branches for an unknown fu_type and the closing braces
// are not visible in this view, so whether FU_duty_cycle is always
// initialised cannot be confirmed here.
628 void FunctionalUnit::computeEnergy() {
629 double pppm_t
[4] = {1, 1, 1, 1};
630 double FU_duty_cycle
;
631 double sckRation
= g_tp
.sckt_co_eff
;
633 // TDP power calculation
634 //2 means two source operands needs to be passed for each int instruction.
635 set_pppm(pppm_t
, 2, 2, 2, 2);
636 tdp_stats
.readAc
.access
= num_fu
;
637 if (fu_type
== FPU
) {
638 FU_duty_cycle
= core_stats
.FPU_duty_cycle
;
639 } else if (fu_type
== ALU
) {
640 FU_duty_cycle
= core_stats
.ALU_duty_cycle
;
641 } else if (fu_type
== MUL
) {
642 FU_duty_cycle
= core_stats
.MUL_duty_cycle
;
645 power
.readOp
.dynamic
=
646 per_access_energy
* tdp_stats
.readAc
.access
+ base_energy
/ clockRate
;
647 power
.readOp
.dynamic
*= sckRation
* FU_duty_cycle
;
649 // Runtime power calculation
650 if (fu_type
== FPU
) {
651 rtp_stats
.readAc
.access
= core_stats
.fpu_accesses
;
652 } else if (fu_type
== ALU
) {
653 rtp_stats
.readAc
.access
= core_stats
.ialu_accesses
;
654 } else if (fu_type
== MUL
) {
655 rtp_stats
.readAc
.access
= core_stats
.mul_accesses
;
658 rt_power
.readOp
.dynamic
= per_access_energy
* rtp_stats
.readAc
.access
+
659 base_energy
* execution_time
;
660 rt_power
.readOp
.dynamic
*= sckRation
;
662 output_data
.area
= area
.get_area() / 1e6
;
663 output_data
.peak_dynamic_power
= power
.readOp
.dynamic
* clockRate
;
// Report the longer-channel leakage instead of the plain leakage when
// longer_channel_device is set.
664 output_data
.subthreshold_leakage_power
=
665 (longer_channel_device
) ? power
.readOp
.longer_channel_leakage
:
666 power
.readOp
.leakage
;
667 output_data
.gate_leakage_power
= power
.readOp
.gate_leakage
;
668 output_data
.runtime_dynamic_energy
= rt_power
.readOp
.dynamic
;
// Recomputes the leakage figures in power.readOp for a new temperature.
// The temperature is rounded to the nearest multiple of 10 and stored in
// interface_ip.temp, init_interface() is re-run, and the leakage part of the
// constructor is repeated per fu_type.
// NOTE(review): the braces and the first branch condition of the fu_type
// chain are not visible in this view.
671 void FunctionalUnit::leakage_feedback(double temperature
)
673 // Update the temperature and initialize the global interfaces.
674 interface_ip
.temp
= (unsigned int)round(temperature
/10.0)*10;
676 // init_result is dummy
677 uca_org_t init_result
= init_interface(&interface_ip
, name
);
679 // This is part of FunctionalUnit()
680 double area_t
, leakage
, gate_leakage
;
681 double pmos_to_nmos_sizing_r
= pmos_to_nmos_sz_ratio();
685 area_t
= 4.47*1e6
*(g_ip
->F_sz_nm
*g_ip
->F_sz_nm
/90.0/90.0);//this is um^2 The base number
686 if (g_ip
->F_sz_nm
>90)
687 area_t
= 4.47*1e6
*g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2
688 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
689 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
691 else if (fu_type
== ALU
)
// NOTE(review): area_t in the ALU and MUL branches here is already
// multiplied by num_fu, and leakage is multiplied by num_fu again at the end
// of this function; the constructor's ALU/MUL area_t has no num_fu factor.
// Possible double counting; confirm.
693 area_t
= 280*260*2*num_fu
*g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2 ALU + MUl
694 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
695 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;
697 else if (fu_type
== MUL
)
699 area_t
= 280*260*2*3*num_fu
*g_tp
.scaling_factor
.logic_scaling_co_eff
;//this is um^2 ALU + MUl
700 leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Isub_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;//unit W
701 gate_leakage
= area_t
*(g_tp
.scaling_factor
.core_tx_density
)*cmos_Ig_leakage(20*g_tp
.min_w_nmos_
, 20*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
/2;
705 cout
<<"Unknown Functional Unit Type"<<endl
;
709 power
.readOp
.leakage
= leakage
*num_fu
;
710 power
.readOp
.gate_leakage
= gate_leakage
*num_fu
;
// NOTE(review): the constructor stores
// power.readOp.leakage * longer_channel_device_reduction(...) in this field,
// whereas here only the reduction factor itself is stored. It looks like the
// multiplication by power.readOp.leakage is missing; confirm.
711 power
.readOp
.longer_channel_leakage
=
712 longer_channel_device_reduction(Core_device
, core_params
.core_ty
);
// UndiffCore constructor: models the "Undifferentiated Core" component.
//
// It derives an area from curve fits on pipeline depth and hardware-thread
// count, converts it to um^2, computes subthreshold and gate leakage from
// that area and the core transistor density, applies the layout overhead,
// and fills output_data.
//
// _xml_data:     forwarded to the McPATComponent base constructor.
// interface_ip_: dereferenced to initialise the interface_ip member.
// dyn_p_:        core parameters; supplies core_ty, Embedded,
//                pipeline_stages, num_hthreads, issueW and clockRate.
//
// NOTE(review): this listing lacks several original lines (717, 723-725,
// 745, 747-748, 750, 752, 754, ...): part of the signature/initialiser list
// and some branch conditions are not visible. Comments below describe only
// the statements that are shown.
715 UndiffCore::UndiffCore(XMLNode
* _xml_data
, InputParameter
* interface_ip_
,
716 const CoreParameters
& dyn_p_
,
718 : McPATComponent(_xml_data
),
719 interface_ip(*interface_ip_
), coredynp(dyn_p_
),
720 core_ty(coredynp
.core_ty
), embedded(coredynp
.Embedded
),
721 pipeline_stage(coredynp
.pipeline_stages
),
722 num_hthreads(coredynp
.num_hthreads
), issue_width(coredynp
.issueW
),
726 name
= "Undifferentiated Core";
727 clockRate
= coredynp
.clockRate
;
// undifferentiated_core holds the area estimate: mm^2 until the 1e6
// conversion below, um^2 afterwards.
729 double undifferentiated_core
= 0;
730 double core_tx_density
= 0;
731 double pmos_to_nmos_sizing_r
= pmos_to_nmos_sz_ratio();
732 double undifferentiated_core_coe
;
// Run init_interface() on this component's interface_ip; the result is kept
// in result2.
734 result2
= init_interface(&interface_ip
, name
);
736 //Compute undifferentiated core area at 90nm.
737 if (embedded
== false) {
738 //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penryn, Prescott, Opteron die measurements
// Both fits are clamped at zero so that a negative fitted area becomes 0.
739 if (core_ty
== OOO
) {
740 undifferentiated_core
= (3.57 * log(pipeline_stage
) - 1.2643) > 0 ?
741 (3.57 * log(pipeline_stage
) - 1.2643) : 0;
742 } else if (core_ty
== Inorder
) {
743 undifferentiated_core
= (-2.19 * log(pipeline_stage
) + 6.55) > 0 ?
744 (-2.19 * log(pipeline_stage
) + 6.55) : 0;
746 cout
<< "invalid core type" << endl
;
// Grow the area with the hardware-thread count: a factor of
// (1 + 0.0716 * logtwo(num_hthreads)).
749 undifferentiated_core
*= (1 + logtwo(num_hthreads
) * 0.0716);
751 //Based on the results in paper "parametrized processor models" Sandia Labs
// Two alternative coefficient values follow; the condition that selects
// between them is on a line not present in this listing.
753 undifferentiated_core_coe
= 0.05;
755 undifferentiated_core_coe
= 0;
// Linear fit in pipeline depth, scaled by the coefficient chosen above.
756 undifferentiated_core
= (0.4109 * pipeline_stage
- 0.776) *
757 undifferentiated_core_coe
;
758 undifferentiated_core
*= (1 + logtwo(num_hthreads
) * 0.0426);
// Apply the technology's logic scaling coefficient and convert units.
761 undifferentiated_core
*= g_tp
.scaling_factor
.logic_scaling_co_eff
*
762 1e6
;//change from mm^2 to um^2
763 core_tx_density
= g_tp
.scaling_factor
.core_tx_density
;
// Leakage = area * transistor density * per-inverter leakage * Vdd, with the
// reference inverter sized at 5x the minimum NMOS width.
764 power
.readOp
.leakage
= undifferentiated_core
*(core_tx_density
)*cmos_Isub_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
;//unit W
765 power
.readOp
.gate_leakage
= undifferentiated_core
*(core_tx_density
)*cmos_Ig_leakage(5*g_tp
.min_w_nmos_
, 5*g_tp
.min_w_nmos_
*pmos_to_nmos_sizing_r
, 1, inv
)*g_tp
.peri_global
.Vdd
;
// Long-channel leakage is the subthreshold leakage times a reduction factor.
767 double long_channel_device_reduction
= longer_channel_device_reduction(Core_device
, coredynp
.core_ty
);
768 power
.readOp
.longer_channel_leakage
=
769 power
.readOp
.leakage
* long_channel_device_reduction
;
770 area
.set_area(undifferentiated_core
);
// Scale the dynamic terms by the technology's sckt_co_eff.
// NOTE(review): no dynamic energy is assigned in the visible part of this
// constructor before this scaling -- confirm whether it is set elsewhere.
772 scktRatio
= g_tp
.sckt_co_eff
;
773 power
.readOp
.dynamic
*= scktRatio
;
774 power
.writeOp
.dynamic
*= scktRatio
;
775 power
.searchOp
.dynamic
*= scktRatio
;
// Inflate the area by the macro layout overhead.
776 macro_PR_overhead
= g_tp
.macro_layout_overhead
;
777 area
.set_area(area
.get_area()*macro_PR_overhead
);
// Publish results; dividing by 1e6 undoes the mm^2 -> um^2 conversion above.
779 output_data
.area
= area
.get_area() / 1e6
;
780 output_data
.peak_dynamic_power
= power
.readOp
.dynamic
* clockRate
;
// Report the long-channel figure when longer_channel_device is set.
781 output_data
.subthreshold_leakage_power
=
782 longer_channel_device
? power
.readOp
.longer_channel_leakage
:
783 power
.readOp
.leakage
;
784 output_data
.gate_leakage_power
= power
.readOp
.gate_leakage
;
// InstructionDecoder constructor: builds a final decoder plus a two-block
// pre-decoder with drivers, sums their areas, computes their power via
// inst_decoder_delay_power(), and fills output_data.
//
// _xml_data:           forwarded to the McPATComponent base constructor.
// configure_interface: copied into l_ip before init_interface() is run.
// opcode_length_:      opcode width in bits; capped at 18 per segment below.
// num_decoders_:       number of decoders; multiplies both area terms.
// device_ty_/core_ty_: passed to longer_channel_device_reduction().
//
// NOTE(review): this listing lacks several original lines (788, 791-792,
// 844-849, 851-852, 854-855, ...): some parameters and most of the
// Decoder/PredecBlk constructor arguments are not visible. Comments below
// describe only the statements that are shown.
787 InstructionDecoder::InstructionDecoder(XMLNode
* _xml_data
, const string _name
,
789 const InputParameter
*configure_interface
,
790 int opcode_length_
, int num_decoders_
,
793 enum Device_ty device_ty_
,
794 enum Core_type core_ty_
)
795 : McPATComponent(_xml_data
), is_default(_is_default
),
796 opcode_length(opcode_length_
), num_decoders(num_decoders_
), x86(x86_
),
797 device_ty(device_ty_
), core_ty(core_ty_
) {
799 * Instruction decoder is different from n to 2^n decoders
800 * that are commonly used in row decoders in memory arrays.
801 * The RISC instruction decoder is typically a very simple device.
802 * We can decode an instruction by simply
803 * separating the machine word into small parts using wire slices
804 * The RISC instruction decoder can be approximate by the n to 2^n decoders,
805 * although this approximation usually underestimate power since each decoded
806 * instruction normally has more than 1 active signal.
808 * However, decoding a CISC instruction word is much more difficult
809 * than the RISC case. A CISC decoder is typically set up as a state machine.
810 * The machine reads the opcode field to determine
811 * what type of instruction it is,
812 * and where the other data values are.
813 * The instruction word is read in piece by piece,
814 * and decisions are made at each stage as to
815 * how the remainder of the instruction word will be read.
816 * (sequencer and ROM are usually needed)
817 * An x86 decoder can be even more complex since
818 * it involve both decoding instructions into u-ops and
819 * merge u-ops when doing micro-ops fusion.
822 clockRate
= clockRate_
;
823 bool is_dram
= false;
824 double pmos_to_nmos_sizing_r
;
825 double load_nmos_width
, load_pmos_width
;
826 double C_driver_load
, R_wire_load
;
// Work on a private copy of the input parameters.
829 l_ip
= *configure_interface
;
830 local_result
= init_interface(&l_ip
, name
);
// NOTE(review): cell.w is also taken from cell_h_def (the height default)
// -- confirm that a square cell is intended.
831 cell
.h
= g_tp
.cell_h_def
;
832 cell
.w
= g_tp
.cell_h_def
;
// Opcodes wider than 18 bits are split into ceil(len / 18) segments, and
// each segment is modelled as a decoder of at most 18 input bits.
834 num_decoder_segments
= (int)ceil(opcode_length
/ 18.0);
835 if (opcode_length
> 18) opcode_length
= 18;
// One decoded output per opcode value: 2^opcode_length.
836 num_decoded_signals
= (int)pow(2.0, opcode_length
);
// Driver load: gate capacitance of 1024 NMOS+PMOS loads and the resistance
// of a wire 3000 feature sizes long. Where these are consumed is not
// visible here; presumably they are Decoder constructor arguments.
837 pmos_to_nmos_sizing_r
= pmos_to_nmos_sz_ratio();
838 load_nmos_width
= g_tp
.max_w_nmos_
/ 2;
839 load_pmos_width
= g_tp
.max_w_nmos_
* pmos_to_nmos_sizing_r
;
840 C_driver_load
= 1024 * gate_C(load_nmos_width
+ load_pmos_width
, 0, is_dram
);
841 R_wire_load
= 3000 * l_ip
.F_sz_um
* g_tp
.wire_outside_mat
.R_per_um
;
// Final decoder; most constructor arguments are not present in this listing.
843 final_dec
= new Decoder(
850 false/*wl_tr*/, //to use peri device
// Two pre-decode blocks, each with its own driver, combined into pre_dec.
853 PredecBlk
* predec_blk1
= new PredecBlk(
856 0,//Assuming predec and dec are back to back
858 1,//Each Predec only drives one final dec
861 PredecBlk
* predec_blk2
= new PredecBlk(
864 0,//Assuming predec and dec are back to back
866 1,//Each Predec only drives one final dec
870 PredecBlkDrv
* predec_blk_drv1
= new PredecBlkDrv(0, predec_blk1
, false);
871 PredecBlkDrv
* predec_blk_drv2
= new PredecBlkDrv(0, predec_blk2
, false);
873 pre_dec
= new Predec(predec_blk_drv1
, predec_blk_drv2
);
// Final-decoder area is replicated per decoded signal, segment and decoder.
875 double area_decoder
= final_dec
->area
.get_area() * num_decoded_signals
*
876 num_decoder_segments
* num_decoders
;
877 //double w_decoder = area_decoder / area.get_h();
// Pre-decoder area (both blocks and both drivers) per segment and decoder.
878 double area_pre_dec
= (predec_blk_drv1
->area
.get_area() +
879 predec_blk_drv2
->area
.get_area() +
880 predec_blk1
->area
.get_area() +
881 predec_blk2
->area
.get_area()) *
882 num_decoder_segments
* num_decoders
;
883 area
.set_area(area
.get_area() + area_decoder
+ area_pre_dec
);
// Inflate the total by both the macro and the chip layout overheads.
884 double macro_layout_overhead
= g_tp
.macro_layout_overhead
;
885 double chip_PR_overhead
= g_tp
.chip_layout_overhead
;
886 area
.set_area(area
.get_area()*macro_layout_overhead
*chip_PR_overhead
);
// Accumulate pre-decoder and final-decoder power into `power`.
888 inst_decoder_delay_power();
// Scale the dynamic terms by the technology's sckt_co_eff.
890 double sckRation
= g_tp
.sckt_co_eff
;
891 power
.readOp
.dynamic
*= sckRation
;
892 power
.writeOp
.dynamic
*= sckRation
;
893 power
.searchOp
.dynamic
*= sckRation
;
// Long-channel leakage is the subthreshold leakage times a reduction factor.
895 double long_channel_device_reduction
=
896 longer_channel_device_reduction(device_ty
, core_ty
);
897 power
.readOp
.longer_channel_leakage
= power
.readOp
.leakage
*
898 long_channel_device_reduction
;
// Publish results; area is divided by 1e6 (presumably um^2 -> mm^2).
900 output_data
.area
= area
.get_area() / 1e6
;
901 output_data
.peak_dynamic_power
= power
.readOp
.dynamic
* clockRate
;
// NOTE(review): the plain leakage is always reported here, whereas the
// UndiffCore constructor selects the long-channel value when
// longer_channel_device is set -- confirm which is intended.
902 output_data
.subthreshold_leakage_power
= power
.readOp
.leakage
;
903 output_data
.gate_leakage_power
= power
.readOp
.gate_leakage
;
906 void InstructionDecoder::inst_decoder_delay_power() {
908 double dec_outrisetime
;
909 double inrisetime
= 0, outrisetime
;
910 double pppm_t
[4] = {1, 1, 1, 1};
911 double squencer_passes
= x86
? 2 : 1;
913 outrisetime
= pre_dec
->compute_delays(inrisetime
);
914 dec_outrisetime
= final_dec
->compute_delays(outrisetime
);
915 set_pppm(pppm_t
, squencer_passes
*num_decoder_segments
, num_decoder_segments
, squencer_passes
*num_decoder_segments
, num_decoder_segments
);
916 power
= power
+ pre_dec
->power
* pppm_t
;
917 set_pppm(pppm_t
, squencer_passes
*num_decoder_segments
, num_decoder_segments
*num_decoded_signals
,
918 num_decoder_segments
*num_decoded_signals
, squencer_passes
*num_decoder_segments
);
919 power
= power
+ final_dec
->power
* pppm_t
;
922 void InstructionDecoder::leakage_feedback(double temperature
) {
923 l_ip
.temp
= (unsigned int)round(temperature
/10.0)*10;
924 uca_org_t init_result
= init_interface(&l_ip
, name
); // init_result is dummy
926 final_dec
->leakage_feedback(temperature
);
927 pre_dec
->leakage_feedback(temperature
);
929 double pppm_t
[4] = {1,1,1,1};
930 double squencer_passes
= x86
?2:1;
932 set_pppm(pppm_t
, squencer_passes
*num_decoder_segments
, num_decoder_segments
, squencer_passes
*num_decoder_segments
, num_decoder_segments
);
933 power
= pre_dec
->power
*pppm_t
;
935 set_pppm(pppm_t
, squencer_passes
*num_decoder_segments
, num_decoder_segments
*num_decoded_signals
,num_decoder_segments
*num_decoded_signals
, squencer_passes
*num_decoder_segments
);
936 power
= power
+ final_dec
->power
*pppm_t
;
938 double sckRation
= g_tp
.sckt_co_eff
;
940 power
.readOp
.dynamic
*= sckRation
;
941 power
.writeOp
.dynamic
*= sckRation
;
942 power
.searchOp
.dynamic
*= sckRation
;
944 double long_channel_device_reduction
= longer_channel_device_reduction(device_ty
,core_ty
);
945 power
.readOp
.longer_channel_leakage
= power
.readOp
.leakage
*long_channel_device_reduction
;
948 InstructionDecoder::~InstructionDecoder() {
949 local_result
.cleanup();
953 delete pre_dec
->blk1
;
954 delete pre_dec
->blk2
;
955 delete pre_dec
->drv1
;
956 delete pre_dec
->drv2
;