ext/mcpat/logic.cc

   1 /*****************************************************************************
   2  *                                McPAT
   3  *                      SOFTWARE LICENSE AGREEMENT
   4  *            Copyright 2012 Hewlett-Packard Development Company, L.P.
   5  *            Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
   6  *                          All Rights Reserved
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions are
  10  * met: redistributions of source code must retain the above copyright
  11  * notice, this list of conditions and the following disclaimer;
  12  * redistributions in binary form must reproduce the above copyright
  13  * notice, this list of conditions and the following disclaimer in the
  14  * documentation and/or other materials provided with the distribution;
  15  * neither the name of the copyright holders nor the names of its
  16  * contributors may be used to endorse or promote products derived from
  17  * this software without specific prior written permission.
  18
  19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  23  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  ***************************************************************************/
  32
  33 #include "common.h"
  34 #include "logic.h"
  35
  36 //selection_logic
  37 selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default,
  38                                  int _win_entries, int issue_width_,
  39                                  const InputParameter *configure_interface,
  40                                  string _name, double _accesses,
  41                                  double clockRate_, enum Device_ty device_ty_,
  42                                  enum Core_type core_ty_)
  43     : McPATComponent(_xml_data), is_default(_is_default),
  44       win_entries(_win_entries),
  45       issue_width(issue_width_),
  46       accesses(_accesses),
  47       device_ty(device_ty_),
  48       core_ty(core_ty_) {
  49     clockRate = clockRate_;
  50     name = _name;
  51     l_ip = *configure_interface;
  52     local_result = init_interface(&l_ip, name);
  53 }
  54
  55 void selection_logic::computeArea() {
  56     output_data.area = local_result.area;
  57 }
  58
  59 void selection_logic::computeEnergy() {
  60     //based on cost effective superscalar processor TR pp27-31
  61     double Ctotal, Cor, Cpencode;
  62     int num_arbiter;
  63     double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
  64
  65     //the 0.8um process data is used.
  66     //this was 10 micron for the 0.8 micron process
  67     WSelORn     = 12.5 * l_ip.F_sz_um;
  68     //this was 40 micron for the 0.8 micron process
  69     WSelORprequ = 50 * l_ip.F_sz_um;
  70     //this was 10mcron for the 0.8 micron process
  71     WSelPn = 12.5 * l_ip.F_sz_um;
  72     //this was 15 micron for the 0.8 micron process
  73     WSelPp = 18.75 * l_ip.F_sz_um;
  74     //this was 5 micron for the 0.8 micron process
  75     WSelEnn     = 6.25 * l_ip.F_sz_um;
  76     //this was 10 micron for the 0.8 micron process
  77     WSelEnp     = 12.5 * l_ip.F_sz_um;
  78
  79     Ctotal = 0;
  80     num_arbiter = 1;
  81     while (win_entries > 4) {
  82         win_entries = (int)ceil((double)win_entries / 4.0);
  83         num_arbiter += win_entries;
  84     }
  85     //the 4-input OR logic to generate anyreq
  86     Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
  87         drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
  88     power.readOp.gate_leakage =
  89         cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;
  90
  91     //The total capacity of the 4-bit priority encoder
  92     Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
  93         drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
  94         2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
  95         drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
  96         3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
  97         drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
  98         4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
  99         drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic
 100         2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
 101         4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
 102         2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic
 103         (2 * 4 + 2 * 3 + 2 * 2 + 2) *
 104         gate_C(WSelPn + WSelPp, 10.0);//requests signal
 105
 106     Ctotal += issue_width * num_arbiter * (Cor + Cpencode);
 107
 108     //2 means the abitration signal need to travel round trip
 109     power.readOp.dynamic =
 110         Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2;
 111     power.readOp.leakage = issue_width * num_arbiter *
 112         (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
 113          + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
 114          + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
 115          + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
 116          + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
 117             ) * g_tp.peri_global.Vdd;
 118     power.readOp.gate_leakage = issue_width * num_arbiter *
 119         (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
 120          + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
 121          + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
 122          + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
 123          + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
 124             ) * g_tp.peri_global.Vdd;
 125     double sckRation = g_tp.sckt_co_eff;
 126     power.readOp.dynamic *= sckRation;
 127     power.writeOp.dynamic *= sckRation;
 128     power.searchOp.dynamic *= sckRation;
 129
 130     double long_channel_device_reduction =
 131         longer_channel_device_reduction(device_ty, core_ty);
 132     power.readOp.longer_channel_leakage =
 133         power.readOp.leakage * long_channel_device_reduction;
 134
 135     output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
 136     output_data.subthreshold_leakage_power = power.readOp.leakage;
 137     output_data.gate_leakage_power = power.readOp.gate_leakage;
 138     output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses;
 139 }
 140
 141 dep_resource_conflict_check::dep_resource_conflict_check(
 142     XMLNode* _xml_data, const string _name,
 143     const InputParameter *configure_interface,
 144     const CoreParameters & dyn_p_, int compare_bits_,
 145     double clockRate_, bool _is_default)
 146     : McPATComponent(_xml_data), l_ip(*configure_interface),
 147       coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) {
 148
 149     name = _name;
 150     clockRate = clockRate_;
 151     //this was 20.0 micron for the 0.8 micron process
 152     Wcompn = 25 * l_ip.F_sz_um;
 153     //this was 20.0 micron for the 0.8 micron process
 154     Wevalinvp = 25 * l_ip.F_sz_um;
 155     //this was 80.0 mcron for the 0.8 micron process
 156     Wevalinvn = 100 * l_ip.F_sz_um;
 157     //this was 40.0  micron for the 0.8 micron process
 158     Wcomppreequ = 50 * l_ip.F_sz_um;
 159     //this was 5.4 micron for the 0.8 micron process
 160     WNORn =     6.75 * l_ip.F_sz_um;
 161     //this was 30.5 micron for the 0.8 micron process
 162     WNORp =     38.125 * l_ip.F_sz_um;
 163
 164     // To make CACTI happy.
 165     l_ip.cache_sz = MIN_BUFFER_SIZE;
 166     local_result = init_interface(&l_ip, name);
 167
 168     if (coredynp.core_ty == Inorder)
 169         //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
 170         //opcode comparator
 171         compare_bits += 16 + 8 + 8;
 172     else
 173         compare_bits += 16 + 8 + 8;
 174
 175     conflict_check_power();
 176     double sckRation = g_tp.sckt_co_eff;
 177     power.readOp.dynamic *= sckRation;
 178     power.writeOp.dynamic *= sckRation;
 179     power.searchOp.dynamic *= sckRation;
 180
 181 }
 182
 183 void dep_resource_conflict_check::conflict_check_power() {
 184     double Ctotal;
 185     int num_comparators;
 186     //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
 187     //dest to dest comparision.
 188     num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
 189                            coredynp.decodeW);
 190
 191     Ctotal = num_comparators * compare_cap();
 192
 193     power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd *
 194         g_tp.peri_global.Vdd /*AF*/;
 195     power.readOp.leakage = num_comparators * compare_bits * 2 *
 196         simplified_nmos_leakage(Wcompn,  false);
 197
 198     double long_channel_device_reduction =
 199         longer_channel_device_reduction(Core_device, coredynp.core_ty);
 200     power.readOp.longer_channel_leakage =
 201         power.readOp.leakage * long_channel_device_reduction;
 202     power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
 203         cmos_Ig_leakage(Wcompn, 0, 2, nmos);
 204
 205 }
 206
 207 /* estimate comparator power consumption (this comparator is similar
 208    to the tag-match structure in a CAM */
 209 double dep_resource_conflict_check::compare_cap() {
 210     double c1, c2;
 211
 212     //resize the big NOR gate at the DCL according to fan in.
 213     WNORp = WNORp * compare_bits / 2.0;
 214     /* bottom part of comparator */
 215     c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
 216                            drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
 217         drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
 218         drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);
 219
 220     /* top part of comparator */
 221     c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
 222                            drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
 223                            drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
 224         gate_C(WNORn + WNORp, 10.0) +
 225         drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits *
 226         drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);
 227     return(c1 + c2);
 228
 229 }
 230
 231 void dep_resource_conflict_check::leakage_feedback(double temperature)
 232 {
 233   l_ip.temp = (unsigned int)round(temperature/10.0)*10;
 234   uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
 235
 236   // This is part of conflict_check_power()
 237   // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
 238   // to dest comparison.
 239   int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
 240                              coredynp.decodeW);
 241   power.readOp.leakage = num_comparators * compare_bits * 2 *
 242       simplified_nmos_leakage(Wcompn,  false);
 243
 244   double long_channel_device_reduction =
 245       longer_channel_device_reduction(Core_device, coredynp.core_ty);
 246   power.readOp.longer_channel_leakage = power.readOp.leakage *
 247       long_channel_device_reduction;
 248   power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
 249       cmos_Ig_leakage(Wcompn, 0, 2, nmos);
 250 }
 251
 252
 253 DFFCell::DFFCell(
 254     bool _is_dram,
 255     double _WdecNANDn,
 256     double _WdecNANDp,
 257     double _cell_load,
 258     const InputParameter *configure_interface)
 259         : is_dram(_is_dram),
 260         cell_load(_cell_load),
 261         WdecNANDn(_WdecNANDn),
 262         WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF.
 263     l_ip = *configure_interface;
 264     area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp,
 265                                         g_tp.cell_h_def)
 266                   + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn,
 267                                       g_tp.cell_h_def));
 268
 269
 270 }
 271
 272
 273 double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
 274     double Ctotal = 0;
 275
 276     /* part 1: drain cap of NAND gate */
 277     Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
 278
 279     /* part 2: gate cap of NAND gates */
 280     Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
 281
 282     return Ctotal;
 283 }
 284
 285
 286 void DFFCell::compute_DFF_cell() {
 287     double c1, c2, c3, c4, c5, c6;
 288     /* node 5 and node 6 are identical to node 1 in capacitance */
 289     c1 = c5 = c6 = fpfp_node_cap(2, 1);
 290     c2 = fpfp_node_cap(2, 3);
 291     c3 = fpfp_node_cap(3, 2);
 292     c4 = fpfp_node_cap(2, 2);
 293
 294     //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
 295     clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
 296     e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) *
 297         0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
 298
 299     /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
 300     e_keep_1.readOp.dynamic +=
 301         c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
 302     e_keep_0.readOp.dynamic +=
 303         c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
 304     e_clock.readOp.dynamic +=
 305         clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
 306
 307     /* static power */
 308     e_switch.readOp.leakage +=
 309         (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) *
 310          5//5 NAND2 and 1 NAND3 in a DFF
 311          + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
 312         g_tp.peri_global.Vdd;
 313     e_switch.readOp.gate_leakage +=
 314         (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) *
 315          5//5 NAND2 and 1 NAND3 in a DFF
 316          + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
 317         g_tp.peri_global.Vdd;
 318 }
 319
 320 Pipeline::Pipeline(XMLNode* _xml_data,
 321                    const InputParameter *configure_interface,
 322                    const CoreParameters & dyn_p_,
 323                    enum Device_ty device_ty_,
 324                    bool _is_core_pipeline,
 325                    bool _is_default)
 326     : McPATComponent(_xml_data), l_ip(*configure_interface),
 327       coredynp(dyn_p_), device_ty(device_ty_),
 328       is_core_pipeline(_is_core_pipeline), is_default(_is_default),
 329       num_piperegs(0.0) {
 330     name = "Pipeline?";
 331
 332     local_result = init_interface(&l_ip, name);
 333     if (!coredynp.Embedded) {
 334         process_ind = true;
 335     } else {
 336         process_ind = false;
 337     }
 338     //this was  20 micron for the 0.8 micron process
 339     WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;
 340     //this was  30 micron for the 0.8 micron process
 341     WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ *
 342         pmos_to_nmos_sz_ratio();
 343     load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
 344     compute();
 345
 346 }
 347
 348 void Pipeline::compute() {
 349     compute_stage_vector();
 350     DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
 351     pipe_reg.compute_DFF_cell();
 352
 353     double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
 354     //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
 355     //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
 356     double pipe_reg_power = num_piperegs *
 357         (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic +
 358          pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg;
 359     double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
 360     double pipe_reg_gate_leakage = num_piperegs *
 361         pipe_reg.e_switch.readOp.gate_leakage;
 362     power.readOp.dynamic        += pipe_reg_power;
 363     power.readOp.leakage        += pipe_reg_leakage;
 364     power.readOp.gate_leakage   += pipe_reg_gate_leakage;
 365     area.set_area(num_piperegs * pipe_reg.area.get_area());
 366
 367     double long_channel_device_reduction =
 368         longer_channel_device_reduction(device_ty, coredynp.core_ty);
 369     power.readOp.longer_channel_leakage = power.readOp.leakage *
 370         long_channel_device_reduction;
 371
 372
 373     double sckRation = g_tp.sckt_co_eff;
 374     power.readOp.dynamic *= sckRation;
 375     power.writeOp.dynamic *= sckRation;
 376     power.searchOp.dynamic *= sckRation;
 377     double macro_layout_overhead = g_tp.macro_layout_overhead;
 378         if (!coredynp.Embedded)
 379                 area.set_area(area.get_area() * macro_layout_overhead);
 380
 381     output_data.area = area.get_area() / 1e6;
 382     output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
 383     output_data.subthreshold_leakage_power = power.readOp.leakage;
 384     output_data.gate_leakage_power = power.readOp.gate_leakage;
 385     output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;
 386 }
 387
 388 void Pipeline::compute_stage_vector() {
 389     double num_stages, tot_stage_vector, per_stage_vector;
 390     int opcode_length = coredynp.x86 ?
 391         coredynp.micro_opcode_length : coredynp.opcode_width;
 392
 393     if (!is_core_pipeline) {
 394         //The number of pipeline stages are calculated based on the achievable
 395         //throughput and required throughput
 396         num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector;
 397     } else {
 398         if (coredynp.core_ty == Inorder) {
 399             /* assume 6 pipe stages and try to estimate bits per pipe stage */
 400             /* pipe stage 0/IF */
 401             num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
 402             /* pipe stage IF/ID */
 403             num_piperegs += coredynp.fetchW *
 404                 (coredynp.instruction_length + coredynp.pc_width) *
 405                 coredynp.num_hthreads;
 406             /* pipe stage IF/ThreadSEL */
 407             if (coredynp.multithreaded) {
 408                 num_piperegs += coredynp.num_hthreads *
 409                     coredynp.perThreadState; //8 bit thread states
 410             }
 411             /* pipe stage ID/EXE */
 412             num_piperegs += coredynp.decodeW *
 413                 (coredynp.instruction_length + coredynp.pc_width +
 414                  pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
 415                 coredynp.num_hthreads;
 416             /* pipe stage EXE/MEM */
 417             num_piperegs += coredynp.issueW *
 418                 (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 *
 419                  2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
 420             /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
 421             num_piperegs += coredynp.issueW *
 422                 (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 *
 423                  2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
 424             num_stages = 6;
 425         } else {
 426             /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
 427             /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
 428
 429             /* pipe stage 0/1F*/
 430             num_piperegs +=
 431                 coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC
 432             /* pipe stage IF/ID */
 433             num_piperegs += coredynp.fetchW *
 434                 (coredynp.instruction_length + coredynp.pc_width) *
 435                 coredynp.num_hthreads;//PC is used to feed branch predictor in ID
 436             /* pipe stage 1D/Renaming*/
 437             num_piperegs += coredynp.decodeW *
 438                 (coredynp.instruction_length + coredynp.pc_width) *
 439                 coredynp.num_hthreads;//PC is for branch exe in later stage.
 440             /* pipe stage Renaming/wire_drive */
 441             num_piperegs += coredynp.decodeW *
 442                 (coredynp.instruction_length + coredynp.pc_width);
 443             /* pipe stage Renaming/IssueQ */
 444             //3*coredynp.phy_ireg_width means 2 sources and 1 dest
 445             num_piperegs += coredynp.issueW *
 446                 (coredynp.instruction_length  + coredynp.pc_width + 3 *
 447                  coredynp.phy_ireg_width) * coredynp.num_hthreads;
 448             /* pipe stage IssueQ/Dispatch */
 449             num_piperegs += coredynp.issueW *
 450                 (coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
 451             /* pipe stage Dispatch/EXE */
 452
 453             num_piperegs += coredynp.issueW *
 454                 (3 * coredynp.phy_ireg_width + coredynp.pc_width +
 455                  pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
 456             /* 2^opcode_length means the total decoded signal for the opcode*/
 457             num_piperegs += coredynp.issueW *
 458                 (2 * coredynp.int_data_width + pow(2.0, opcode_length)
 459                  /*+2*powers (2,reg_length)*/);
 460             /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
 461             num_piperegs += coredynp.issueW *
 462                 (2 * coredynp.int_data_width + pow(2.0, opcode_length)
 463                  /*+2*powers (2,reg_length)*/);
 464             /* pipe stage EXE/MEM, data need to be read/write, address*/
 465             //memory Opcode still need to be passed
 466             num_piperegs += coredynp.issueW *
 467                 (coredynp.int_data_width + coredynp.v_address_width +
 468                  pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
 469             /* pipe stage MEM/WB; result data, writeback regs */
 470             num_piperegs += coredynp.issueW *
 471                 (coredynp.int_data_width + coredynp.phy_ireg_width
 472                  /* powers (2,opcode_length) +
 473                     (2,opcode_length)+2*powers (2,reg_length)*/);
 474             /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
 475             num_piperegs += coredynp.commitW *
 476                 (coredynp.int_data_width + coredynp.v_address_width +
 477                  coredynp.phy_ireg_width
 478                  /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
 479                 coredynp.num_hthreads;
 480             num_stages = 12;
 481
 482         }
 483
 484         /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
 485         num_piperegs = num_piperegs * 1.5;
 486         tot_stage_vector = num_piperegs;
 487         per_stage_vector = tot_stage_vector / num_stages;
 488
 489         if (coredynp.core_ty == Inorder) {
 490             if (coredynp.pipeline_stages > 6)
 491                 num_piperegs = per_stage_vector * coredynp.pipeline_stages;
 492         } else { //OOO
 493             if (coredynp.pipeline_stages > 12)
 494                 num_piperegs = per_stage_vector * coredynp.pipeline_stages;
 495         }
 496     }
 497
 498 }
 499
 500 FunctionalUnit::FunctionalUnit(XMLNode* _xml_data,
 501                                InputParameter* interface_ip_,
 502                                const CoreParameters & _core_params,
 503                                const CoreStatistics & _core_stats,
 504                                enum FU_type fu_type_)
 505     : McPATComponent(_xml_data),
 506       interface_ip(*interface_ip_), core_params(_core_params),
 507       core_stats(_core_stats), fu_type(fu_type_) {
 508     double area_t;
 509     double leakage;
 510     double gate_leakage;
 511     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
 512     clockRate = core_params.clockRate;
 513
 514     uca_org_t result2;
 515     // Temp name for the following function call
 516     name = "Functional Unit";
 517
 518     result2 = init_interface(&interface_ip, name);
 519
 520         if (core_params.Embedded) {
 521             if (fu_type == FPU) {
 522                 num_fu=core_params.num_fpus;
 523                         //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
 524                         area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
 525                         //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
 526                         if (g_ip->F_sz_nm>90)
 527                                 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
 528                         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 529                         gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 530                         //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
 531 //                      base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
 532 //                      base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
 533                         base_energy = 0;
 534                         per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
 535                         //FPU power from Sandia's processor sizing tech report
 536                         FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
 537             } else if (fu_type == ALU) {
 538                 num_fu=core_params.num_alus;
 539                         area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
 540                         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 541                         gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
 542 //                      base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
 543 //                      base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
 544                         base_energy = 0;
 545                         per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
 546                         FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
 547
 548             } else if (fu_type == MUL) {
 549                 num_fu=core_params.num_muls;
 550                         area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
 551                         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 552                         gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
 553 //                      base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
 554 //                      base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
 555                         base_energy = 0;
 556                         per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
 557                         FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
 558             } else {
 559                         cout<<"Unknown Functional Unit Type"<<endl;
 560                         exit(0);
 561                 }
 562                 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
 563         } else {
 564             if (fu_type == FPU) {
 565                 name = "Floating Point Unit(s)";
 566                 num_fu = core_params.num_fpus;
 567                 area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
 568                                        90.0);//this is um^2
 569                 if (g_ip->F_sz_nm > 90)
 570                     area_t = 8.47 * 1e6 *
 571                         g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
 572             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 573             gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 574             //W The base energy of ALU average numbers from Intel 4G and
 575             //773Mhz (Wattch)
 576             base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3;
 577             base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
 578                             1.2);
 579             per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
 580             FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
 581         } else if (fu_type == ALU) {
 582             name = "Integer ALU(s)";
 583             num_fu = core_params.num_alus;
 584             //this is um^2 ALU + MUl
 585             area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff;
 586             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 587             gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
 588             //W The base energy of ALU average numbers from Intel 4G and 773Mhz
 589             //(Wattch)
 590             base_energy = core_params.core_ty == Inorder ? 0 : 89e-3;
 591             base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
 592                             1.2);
 593             per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
 594             FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
 595         } else if (fu_type == MUL) {
 596             name = "Multiply/Divide Unit(s)";
 597             num_fu = core_params.num_muls;
 598             //this is um^2 ALU + MUl
 599             area_t = 280 * 260 * 2 * 3 *
 600                 g_tp.scaling_factor.logic_scaling_co_eff;
 601             leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 602             gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
 603             //W The base energy of ALU average numbers from Intel 4G and 773Mhz
 604             //(Wattch)
 605             base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2;
 606             base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
 607                             1.2);
 608             per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
 609             FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
 610         } else {
 611             cout << "Unknown Functional Unit Type" << endl;
 612             exit(0);
 613         }
 614     }
 615
 616     area.set_area(area_t*num_fu);
 617     power.readOp.leakage = leakage * num_fu;
 618     power.readOp.gate_leakage = gate_leakage * num_fu;
 619
 620     double long_channel_device_reduction =
 621         longer_channel_device_reduction(Core_device, core_params.core_ty);
 622     power.readOp.longer_channel_leakage =
 623         power.readOp.leakage * long_channel_device_reduction;
 624     double macro_layout_overhead = g_tp.macro_layout_overhead;
 625     area.set_area(area.get_area()*macro_layout_overhead);
 626 }
 627
 628 void FunctionalUnit::computeEnergy() {
 629     double pppm_t[4]    = {1, 1, 1, 1};
 630     double FU_duty_cycle;
 631     double sckRation = g_tp.sckt_co_eff;
 632
 633     // TDP power calculation
 634     //2 means two source operands needs to be passed for each int instruction.
 635     set_pppm(pppm_t, 2, 2, 2, 2);
 636     tdp_stats.readAc.access = num_fu;
 637     if (fu_type == FPU) {
 638         FU_duty_cycle = core_stats.FPU_duty_cycle;
 639     } else if (fu_type == ALU) {
 640         FU_duty_cycle = core_stats.ALU_duty_cycle;
 641     } else if (fu_type == MUL) {
 642         FU_duty_cycle = core_stats.MUL_duty_cycle;
 643     }
 644
 645     power.readOp.dynamic =
 646         per_access_energy * tdp_stats.readAc.access + base_energy / clockRate;
 647     power.readOp.dynamic *= sckRation * FU_duty_cycle;
 648
 649     // Runtime power calculation
 650     if (fu_type == FPU) {
 651         rtp_stats.readAc.access = core_stats.fpu_accesses;
 652     } else if (fu_type == ALU) {
 653         rtp_stats.readAc.access = core_stats.ialu_accesses;
 654     } else if (fu_type == MUL) {
 655         rtp_stats.readAc.access = core_stats.mul_accesses;
 656     }
 657
 658     rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access +
 659         base_energy * execution_time;
 660     rt_power.readOp.dynamic *= sckRation;
 661
 662     output_data.area = area.get_area() / 1e6;
 663     output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
 664     output_data.subthreshold_leakage_power =
 665         (longer_channel_device) ? power.readOp.longer_channel_leakage :
 666         power.readOp.leakage;
 667     output_data.gate_leakage_power = power.readOp.gate_leakage;
 668     output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;
 669 }
 670
 671 void FunctionalUnit::leakage_feedback(double temperature)
 672 {
 673   // Update the temperature and initialize the global interfaces.
 674   interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
 675
 676   // init_result is dummy
 677   uca_org_t init_result = init_interface(&interface_ip, name);
 678
 679   // This is part of FunctionalUnit()
 680   double area_t, leakage, gate_leakage;
 681   double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
 682
 683   if (fu_type == FPU)
 684   {
 685         area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
 686         if (g_ip->F_sz_nm>90)
 687                 area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
 688         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 689         gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 690   }
 691   else if (fu_type == ALU)
 692   {
 693     area_t = 280*260*2*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
 694     leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 695     gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
 696   }
 697   else if (fu_type == MUL)
 698   {
 699     area_t = 280*260*2*3*num_fu*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
 700     leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
 701     gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
 702   }
 703   else
 704   {
 705     cout<<"Unknown Functional Unit Type"<<endl;
 706     exit(1);
 707   }
 708
 709   power.readOp.leakage = leakage*num_fu;
 710   power.readOp.gate_leakage = gate_leakage*num_fu;
 711   power.readOp.longer_channel_leakage =
 712       longer_channel_device_reduction(Core_device, core_params.core_ty);
 713 }
 714
 715 UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_,
 716                        const CoreParameters & dyn_p_,
 717                        bool exist_)
 718         : McPATComponent(_xml_data),
 719         interface_ip(*interface_ip_), coredynp(dyn_p_),
 720         core_ty(coredynp.core_ty), embedded(coredynp.Embedded),
 721         pipeline_stage(coredynp.pipeline_stages),
 722         num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
 723         exist(exist_) {
 724     if (!exist) return;
 725
 726     name = "Undifferentiated Core";
 727     clockRate = coredynp.clockRate;
 728
 729     double undifferentiated_core = 0;
 730     double core_tx_density = 0;
 731     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
 732         double undifferentiated_core_coe;
 733     uca_org_t result2;
 734     result2 = init_interface(&interface_ip, name);
 735
 736     //Compute undifferentiated core area at 90nm.
 737     if (embedded == false) {
 738         //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
 739         if (core_ty == OOO) {
 740             undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ?
 741                 (3.57 * log(pipeline_stage) - 1.2643) : 0;
 742         } else if (core_ty == Inorder) {
 743             undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ?
 744                 (-2.19 * log(pipeline_stage) + 6.55) : 0;
 745         } else {
 746             cout << "invalid core type" << endl;
 747             exit(0);
 748         }
 749         undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
 750     } else {
 751         //Based on the results in paper "parametrized processor models" Sandia Labs
 752                 if (opt_for_clk)
 753                         undifferentiated_core_coe = 0.05;
 754                 else
 755                         undifferentiated_core_coe = 0;
 756                 undifferentiated_core = (0.4109 * pipeline_stage - 0.776) *
 757                     undifferentiated_core_coe;
 758                 undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
 759     }
 760
 761     undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
 762         1e6;//change from mm^2 to um^2
 763     core_tx_density                 = g_tp.scaling_factor.core_tx_density;
 764     power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
 765     power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;
 766
 767     double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
 768     power.readOp.longer_channel_leakage =
 769         power.readOp.leakage * long_channel_device_reduction;
 770     area.set_area(undifferentiated_core);
 771
 772     scktRatio = g_tp.sckt_co_eff;
 773     power.readOp.dynamic *= scktRatio;
 774     power.writeOp.dynamic *= scktRatio;
 775     power.searchOp.dynamic *= scktRatio;
 776     macro_PR_overhead = g_tp.macro_layout_overhead;
 777     area.set_area(area.get_area()*macro_PR_overhead);
 778
 779     output_data.area = area.get_area() / 1e6;
 780     output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
 781     output_data.subthreshold_leakage_power =
 782         longer_channel_device ? power.readOp.longer_channel_leakage :
 783         power.readOp.leakage;
 784     output_data.gate_leakage_power = power.readOp.gate_leakage;
 785 }
 786
 787 InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name,
 788                                        bool _is_default,
 789                                        const InputParameter *configure_interface,
 790                                        int opcode_length_, int num_decoders_,
 791                                        bool x86_,
 792                                        double clockRate_,
 793                                        enum Device_ty device_ty_,
 794                                        enum Core_type core_ty_)
 795     : McPATComponent(_xml_data), is_default(_is_default),
 796       opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_),
 797       device_ty(device_ty_), core_ty(core_ty_) {
 798     /*
 799      * Instruction decoder is different from n to 2^n decoders
 800      * that are commonly used in row decoders in memory arrays.
 801      * The RISC instruction decoder is typically a very simple device.
 802      * We can decode an instruction by simply
 803      * separating the machine word into small parts using wire slices
 804      * The RISC instruction decoder can be approximate by the n to 2^n decoders,
 805      * although this approximation usually underestimate power since each decoded
 806      * instruction normally has more than 1 active signal.
 807      *
 808      * However, decoding a CISC instruction word is much more difficult
 809      * than the RISC case. A CISC decoder is typically set up as a state machine.
 810      * The machine reads the opcode field to determine
 811      * what type of instruction it is,
 812      * and where the other data values are.
 813      * The instruction word is read in piece by piece,
 814      * and decisions are made at each stage as to
 815      * how the remainder of the instruction word will be read.
 816      * (sequencer and ROM are usually needed)
 817      * An x86 decoder can be even more complex since
 818      * it involve  both decoding instructions into u-ops and
 819      * merge u-ops when doing micro-ops fusion.
 820      */
 821     name = _name;
 822     clockRate = clockRate_;
 823     bool is_dram = false;
 824     double pmos_to_nmos_sizing_r;
 825     double load_nmos_width, load_pmos_width;
 826     double C_driver_load, R_wire_load;
 827     Area cell;
 828
 829     l_ip = *configure_interface;
 830     local_result = init_interface(&l_ip, name);
 831     cell.h = g_tp.cell_h_def;
 832     cell.w = g_tp.cell_h_def;
 833
 834     num_decoder_segments = (int)ceil(opcode_length / 18.0);
 835     if (opcode_length > 18)     opcode_length = 18;
 836     num_decoded_signals = (int)pow(2.0, opcode_length);
 837     pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
 838     load_nmos_width = g_tp.max_w_nmos_ / 2;
 839     load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
 840     C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram);
 841     R_wire_load   = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
 842
 843     final_dec = new Decoder(
 844         num_decoded_signals,
 845         false,
 846         C_driver_load,
 847         R_wire_load,
 848         false/*is_fa*/,
 849         false/*is_dram*/,
 850         false/*wl_tr*/, //to use peri device
 851         cell);
 852
 853     PredecBlk * predec_blk1 = new PredecBlk(
 854         num_decoded_signals,
 855         final_dec,
 856         0,//Assuming predec and dec are back to back
 857         0,
 858         1,//Each Predec only drives one final dec
 859         false/*is_dram*/,
 860         true);
 861     PredecBlk * predec_blk2 = new PredecBlk(
 862         num_decoded_signals,
 863         final_dec,
 864         0,//Assuming predec and dec are back to back
 865         0,
 866         1,//Each Predec only drives one final dec
 867         false/*is_dram*/,
 868         false);
 869
 870     PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
 871     PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
 872
 873     pre_dec            = new Predec(predec_blk_drv1, predec_blk_drv2);
 874
 875     double area_decoder = final_dec->area.get_area() * num_decoded_signals *
 876         num_decoder_segments * num_decoders;
 877     //double w_decoder    = area_decoder / area.get_h();
 878     double area_pre_dec = (predec_blk_drv1->area.get_area() +
 879                            predec_blk_drv2->area.get_area() +
 880                            predec_blk1->area.get_area() +
 881                            predec_blk2->area.get_area()) *
 882                           num_decoder_segments * num_decoders;
 883     area.set_area(area.get_area() + area_decoder + area_pre_dec);
 884     double macro_layout_overhead   = g_tp.macro_layout_overhead;
 885     double chip_PR_overhead        = g_tp.chip_layout_overhead;
 886     area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);
 887
 888     inst_decoder_delay_power();
 889
 890     double sckRation = g_tp.sckt_co_eff;
 891     power.readOp.dynamic *= sckRation;
 892     power.writeOp.dynamic *= sckRation;
 893     power.searchOp.dynamic *= sckRation;
 894
 895     double long_channel_device_reduction =
 896         longer_channel_device_reduction(device_ty, core_ty);
 897     power.readOp.longer_channel_leakage = power.readOp.leakage *
 898         long_channel_device_reduction;
 899
 900     output_data.area = area.get_area() / 1e6;
 901     output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
 902     output_data.subthreshold_leakage_power = power.readOp.leakage;
 903     output_data.gate_leakage_power = power.readOp.gate_leakage;
 904 }
 905
 906 void InstructionDecoder::inst_decoder_delay_power() {
 907
 908     double dec_outrisetime;
 909     double inrisetime = 0, outrisetime;
 910     double pppm_t[4]    = {1, 1, 1, 1};
 911     double squencer_passes = x86 ? 2 : 1;
 912
 913     outrisetime = pre_dec->compute_delays(inrisetime);
 914     dec_outrisetime = final_dec->compute_delays(outrisetime);
 915     set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
 916     power = power + pre_dec->power * pppm_t;
 917     set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
 918              num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
 919     power = power + final_dec->power * pppm_t;
 920 }
 921
 922 void InstructionDecoder::leakage_feedback(double temperature) {
 923   l_ip.temp = (unsigned int)round(temperature/10.0)*10;
 924   uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
 925
 926   final_dec->leakage_feedback(temperature);
 927   pre_dec->leakage_feedback(temperature);
 928
 929   double pppm_t[4]    = {1,1,1,1};
 930   double squencer_passes = x86?2:1;
 931
 932   set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
 933   power = pre_dec->power*pppm_t;
 934
 935   set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
 936   power = power + final_dec->power*pppm_t;
 937
 938   double sckRation = g_tp.sckt_co_eff;
 939
 940   power.readOp.dynamic *= sckRation;
 941   power.writeOp.dynamic *= sckRation;
 942   power.searchOp.dynamic *= sckRation;
 943
 944   double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
 945   power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
 946 }
 947
 948 InstructionDecoder::~InstructionDecoder() {
 949     local_result.cleanup();
 950
 951     delete final_dec;
 952
 953     delete pre_dec->blk1;
 954     delete pre_dec->blk2;
 955     delete pre_dec->drv1;
 956     delete pre_dec->drv2;
 957     delete pre_dec;
 958 }